In [1]:
import argparse
import pymongo
import os
from dotenv import load_dotenv
import pandas as pd
import polars as pl

In [2]:
def init_database() -> pymongo.MongoClient:
    host = os.getenv("MONGO_HOST") or "localhost"
    port = os.getenv("MONGO_PORT") or "27017"
    db_name = os.getenv("MONGO_DB_NAME") or "sns-fake-content"

    client = pymongo.MongoClient(f"mongodb://{host}:{port}/")

    if db_name not in client.list_database_names():
        client.drop_database(db_name)

    return client

In [4]:
client = init_database()

In [110]:
db = client["sns-fake-content"]

if "users" in db.list_collection_names():
	db.drop_collection("users")
users_col = db["users"]

if "hashtags" in db.list_collection_names():
	db.drop_collection("hashtags")
hashtags_col = db["hashtags"]

In [23]:
users = pl.read_csv('../dataset/FakeNewsNet/data/users/master_users.csv')
print(users.columns)

['', 'protected', 'username', 'description', 'verified', 'location', 'name', 'profile_image_url', 'id', 'created_at', 'entities.description.urls', 'entities.description.hashtags', 'public_metrics.followers_count', 'public_metrics.following_count', 'public_metrics.tweet_count', 'public_metrics.listed_count', 'url', 'entities.url.urls', 'pinned_tweet_id', 'entities.description.mentions', 'entities.description.cashtags', 'withheld.country_codes', 'withheld.scope', 'source']


## Insert Users

In [69]:
import json

In [76]:
users_column = [
    "username", 
    "created_at",
    'verified',
    'profile_image_url',
    'name',
    'description',
    pl.col('public_metrics.followers_count').alias('followers_count'),
    pl.col('public_metrics.following_count').alias('following_count'),
    pl.col('public_metrics.tweet_count').alias('tweet_count'),
    pl.col('public_metrics.listed_count').alias('listed_count'),
    'protected',
    'source',
]
users_col.insert_many(users.select(users_column).to_dicts())

<pymongo.results.InsertManyResult at 0x7f9fcc3c1670>

In [111]:
users_col.find_one()

## Insert Entities

In [107]:
hashtags = users.filter(pl.col('entities.description.hashtags').is_not_null())\
    .select(pl.col('id'), pl.col('entities.description.hashtags').alias('hashtag')).to_dicts()
hashtags

[{'id': 458320553,
  'hashtag': "[{'start': 20, 'end': 23, 'tag': 'T5'}, {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}]"},
 {'id': 2244289393,
  'hashtag': "[{'start': 15, 'end': 27, 'tag': 'archSTORIES'}, {'start': 37, 'end': 47, 'tag': 'archTALKS'}, {'start': 59, 'end': 68, 'tag': 'archTIPS'}, {'start': 81, 'end': 90, 'tag': 'archEATS'}, {'start': 136, 'end': 147, 'tag': 'TheArchWay'}]"},
 {'id': 1961751636,
  'hashtag': "[{'start': 67, 'end': 75, 'tag': 'giforce'}, {'start': 76, 'end': 90, 'tag': 'TeamHadidNews'}]"},
 {'id': 403204605,
  'hashtag': "[{'start': 0, 'end': 4, 'tag': 'gmm'}, {'start': 5, 'end': 17, 'tag': 'earbiscuits'}, {'start': 18, 'end': 30, 'tag': 'alwayssunny'}, {'start': 31, 'end': 43, 'tag': '2bears1cave'}, {'start': 44, 'end': 55, 'tag': 'disjointed'}, {'start': 56, 'end': 65, 'tag': 'goniners'}, {'start': 66, 'end': 80, 'tag': 'herecomesduke'}, {'start': 81, 'end': 97, 'tag': 'unclejoeysjoint'}, {'start': 98, 'end': 116, 'tag': 'cannabiscommu

In [108]:
hashtags = list(map(lambda x: {**x, 'hashtag_raw': json.loads(x['hashtag'].replace("\'", '\"')), 'hashtags': list(map(lambda y: y['tag'],json.loads(x['hashtag'].replace("\'", '\"'))))}, hashtags))

In [109]:
hashtags

[{'id': 458320553,
  'hashtag': ['T5', 'TerrificTouchontheTurnTables'],
  'hashtag_raw': [{'start': 20, 'end': 23, 'tag': 'T5'},
   {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}]},
 {'id': 2244289393,
  'hashtag': ['archSTORIES',
   'archTALKS',
   'archTIPS',
   'archEATS',
   'TheArchWay'],
  'hashtag_raw': [{'start': 15, 'end': 27, 'tag': 'archSTORIES'},
   {'start': 37, 'end': 47, 'tag': 'archTALKS'},
   {'start': 59, 'end': 68, 'tag': 'archTIPS'},
   {'start': 81, 'end': 90, 'tag': 'archEATS'},
   {'start': 136, 'end': 147, 'tag': 'TheArchWay'}]},
 {'id': 1961751636,
  'hashtag': ['giforce', 'TeamHadidNews'],
  'hashtag_raw': [{'start': 67, 'end': 75, 'tag': 'giforce'},
   {'start': 76, 'end': 90, 'tag': 'TeamHadidNews'}]},
 {'id': 403204605,
  'hashtag': ['gmm',
   'earbiscuits',
   'alwayssunny',
   '2bears1cave',
   'disjointed',
   'goniners',
   'herecomesduke',
   'unclejoeysjoint',
   'cannabiscommunity',
   'ymh',
   'tkats',
   'badfriends',
   'roastme'

In [112]:
hashtags_col.insert_many(hashtags)

<pymongo.results.InsertManyResult at 0x7f9fa85c1940>

In [115]:
hashtags_col.find_one()

{'_id': ObjectId('640e581f3603294868a3d2b1'),
 'id': 458320553,
 'hashtag': ['T5', 'TerrificTouchontheTurnTables'],
 'hashtag_raw': [{'start': 20, 'end': 23, 'tag': 'T5'},
  {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}]}

In [121]:
urls = users.filter(pl.col('entities.description.urls').is_not_null())\
    .select(pl.col('id'), pl.col('entities.description.urls').alias('urls')).to_dicts()
urls

[{'id': 458320553,
  'urls': "[{'start': 120, 'end': 143, 'url': 'https://t.co/JWFNpA4ZFO', 'expanded_url': 'http://www.xliveafrica.com', 'display_url': 'xliveafrica.com'}]"},
 {'id': 248372791,
  'urls': "[{'start': 134, 'end': 157, 'url': 'https://t.co/IrLvgOa1qo', 'expanded_url': 'http://bendaly.co.uk', 'display_url': 'bendaly.co.uk'}]"},
 {'id': 857092418,
  'urls': "[{'start': 80, 'end': 103, 'url': 'https://t.co/UCTfo9adFg', 'expanded_url': 'http://facebook.com/Alfa973/', 'display_url': 'facebook.com/Alfa973/'}, {'start': 108, 'end': 131, 'url': 'https://t.co/ooY3lKgxp4', 'expanded_url': 'http://instagram.com/alfa97.3/', 'display_url': 'instagram.com/alfa97.3/'}]"},
 {'id': 182777119,
  'urls': "[{'start': 68, 'end': 91, 'url': 'https://t.co/JqTB2h4yFl', 'expanded_url': 'https://www.patreon.com/xahlee', 'display_url': 'patreon.com/xahlee'}]"},
 {'id': 28462473,
  'urls': "[{'start': 121, 'end': 144, 'url': 'https://t.co/aIPwXxKx5B', 'expanded_url': 'https://fandom.ink/@mkelliotmk

In [124]:
urls = list(
    map(
        lambda x: {
            **x,
            "url_raw": json.loads(x["urls"].replace("\'", '\"')),
            "urls": list(
                map(
                    lambda y: y["expanded_url"], json.loads(x["urls"].replace("'", '"'))
                )
            ),
        },
        hashtags,
    )
)


JSONDecodeError: Invalid \escape: line 1 column 127 (char 126)