In [1]:
import argparse
import pymongo
import os
from dotenv import load_dotenv
import pandas as pd
import polars as pl

In [2]:
def init_database() -> pymongo.MongoClient:
    host = os.getenv("MONGO_HOST") or "localhost"
    port = os.getenv("MONGO_PORT") or "27017"
    db_name = os.getenv("MONGO_DB_NAME") or "sns-fake-content"

    client = pymongo.MongoClient(f"mongodb://{host}:{port}/")

    if db_name not in client.list_database_names():
        client.drop_database(db_name)

    return client

In [3]:
client = init_database()

In [4]:
db = client["sns-fake-content"]

if "users" in db.list_collection_names():
	db.drop_collection("users")
users_col = db["users"]

if "hashtags" in db.list_collection_names():
	db.drop_collection("hashtags")
hashtags_col = db["hashtags"]

if "urls" in db.list_collection_names():
	db.drop_collection("urls")
urls_col = db["urls"]

if "cashtags" in db.list_collection_names():
	db.drop_collection("cashtags")
cashtags_col = db["cashtags"]

In [5]:
users = pl.read_csv('../dataset/FakeNewsNet/data/users/master_users.csv')
print(users.columns)

['', 'protected', 'username', 'description', 'verified', 'location', 'name', 'profile_image_url', 'id', 'created_at', 'entities.description.urls', 'entities.description.hashtags', 'public_metrics.followers_count', 'public_metrics.following_count', 'public_metrics.tweet_count', 'public_metrics.listed_count', 'url', 'entities.url.urls', 'pinned_tweet_id', 'entities.description.mentions', 'entities.description.cashtags', 'withheld.country_codes', 'withheld.scope', 'source']


## Insert Users

In [205]:
import json

In [206]:
users_column = [
    "username", 
    "created_at",
    'verified',
    'profile_image_url',
    'name',
    'description',
    pl.col('public_metrics.followers_count').alias('followers_count'),
    pl.col('public_metrics.following_count').alias('following_count'),
    pl.col('public_metrics.tweet_count').alias('tweet_count'),
    pl.col('public_metrics.listed_count').alias('listed_count'),
    'pinned_tweet_id',
    'protected',
    'source',
    'withheld.country_codes'
]
users_col.insert_many(users.select(users_column).to_dicts())

<pymongo.results.InsertManyResult at 0x7f36a873fdc0>

In [207]:
users_col.find_one()

{'_id': ObjectId('640fa980110cf4d9327967ec'),
 'username': 'DJTHEOGH',
 'created_at': '2012-01-08T12:34:44.000Z',
 'verified': False,
 'profile_image_url': 'https://pbs.twimg.com/profile_images/762324645207040004/gCWUPJ49_normal.jpg',
 'name': 'DJ Theo gh',
 'description': 'DJ extraordinaire | #T5 on the Xpress breakfast show | #TerrificTouchontheTurnTables | 6:00 - 9:00 a.m | Xlive Africa | https://t.co/JWFNpA4ZFO',
 'followers_count': 259,
 'following_count': 473,
 'tweet_count': 19989,
 'listed_count': 14,
 'pinned_tweet_id': None,
 'protected': False,
 'source': 'gossipcop_fake',
 'withheld.country_codes': None}

## Insert Entities

In [208]:
hashtags = users.filter(pl.col('entities.description.hashtags').is_not_null())\
    .select(pl.col('id').alias('user_id'), pl.col('entities.description.hashtags').alias('hashtag')).to_dicts()
hashtags

[{'user_id': 458320553,
  'hashtag': "[{'start': 20, 'end': 23, 'tag': 'T5'}, {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}]"},
 {'user_id': 2244289393,
  'hashtag': "[{'start': 15, 'end': 27, 'tag': 'archSTORIES'}, {'start': 37, 'end': 47, 'tag': 'archTALKS'}, {'start': 59, 'end': 68, 'tag': 'archTIPS'}, {'start': 81, 'end': 90, 'tag': 'archEATS'}, {'start': 136, 'end': 147, 'tag': 'TheArchWay'}]"},
 {'user_id': 1961751636,
  'hashtag': "[{'start': 67, 'end': 75, 'tag': 'giforce'}, {'start': 76, 'end': 90, 'tag': 'TeamHadidNews'}]"},
 {'user_id': 403204605,
  'hashtag': "[{'start': 0, 'end': 4, 'tag': 'gmm'}, {'start': 5, 'end': 17, 'tag': 'earbiscuits'}, {'start': 18, 'end': 30, 'tag': 'alwayssunny'}, {'start': 31, 'end': 43, 'tag': '2bears1cave'}, {'start': 44, 'end': 55, 'tag': 'disjointed'}, {'start': 56, 'end': 65, 'tag': 'goniners'}, {'start': 66, 'end': 80, 'tag': 'herecomesduke'}, {'start': 81, 'end': 97, 'tag': 'unclejoeysjoint'}, {'start': 98, 'end': 116, '

In [209]:
hashtags = list(map(lambda x: {**x, 'hashtag_raw': json.loads(x['hashtag'].replace("\'", '\"')), 'hashtags': list(map(lambda y: y['tag'],json.loads(x['hashtag'].replace("\'", '\"'))))}, hashtags))

In [210]:
hashtags

[{'user_id': 458320553,
  'hashtag': "[{'start': 20, 'end': 23, 'tag': 'T5'}, {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}]",
  'hashtag_raw': [{'start': 20, 'end': 23, 'tag': 'T5'},
   {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}],
  'hashtags': ['T5', 'TerrificTouchontheTurnTables']},
 {'user_id': 2244289393,
  'hashtag': "[{'start': 15, 'end': 27, 'tag': 'archSTORIES'}, {'start': 37, 'end': 47, 'tag': 'archTALKS'}, {'start': 59, 'end': 68, 'tag': 'archTIPS'}, {'start': 81, 'end': 90, 'tag': 'archEATS'}, {'start': 136, 'end': 147, 'tag': 'TheArchWay'}]",
  'hashtag_raw': [{'start': 15, 'end': 27, 'tag': 'archSTORIES'},
   {'start': 37, 'end': 47, 'tag': 'archTALKS'},
   {'start': 59, 'end': 68, 'tag': 'archTIPS'},
   {'start': 81, 'end': 90, 'tag': 'archEATS'},
   {'start': 136, 'end': 147, 'tag': 'TheArchWay'}],
  'hashtags': ['archSTORIES',
   'archTALKS',
   'archTIPS',
   'archEATS',
   'TheArchWay']},
 {'user_id': 1961751636,
  'hashtag': "[

In [211]:
hashtags_col.insert_many(hashtags)

<pymongo.results.InsertManyResult at 0x7f369368bfd0>

In [212]:
hashtags_col.find_one()

{'_id': ObjectId('640fa98a110cf4d93282ad4d'),
 'user_id': 458320553,
 'hashtag': "[{'start': 20, 'end': 23, 'tag': 'T5'}, {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}]",
 'hashtag_raw': [{'start': 20, 'end': 23, 'tag': 'T5'},
  {'start': 55, 'end': 84, 'tag': 'TerrificTouchontheTurnTables'}],
 'hashtags': ['T5', 'TerrificTouchontheTurnTables']}

In [213]:
urls = users.filter(pl.col('entities.description.urls').is_not_null())\
    .select(pl.col('id').alias('user_id'), pl.col('entities.description.urls').alias('urls')).to_dicts()
urls

[{'user_id': 458320553,
  'urls': "[{'start': 120, 'end': 143, 'url': 'https://t.co/JWFNpA4ZFO', 'expanded_url': 'http://www.xliveafrica.com', 'display_url': 'xliveafrica.com'}]"},
 {'user_id': 248372791,
  'urls': "[{'start': 134, 'end': 157, 'url': 'https://t.co/IrLvgOa1qo', 'expanded_url': 'http://bendaly.co.uk', 'display_url': 'bendaly.co.uk'}]"},
 {'user_id': 857092418,
  'urls': "[{'start': 80, 'end': 103, 'url': 'https://t.co/UCTfo9adFg', 'expanded_url': 'http://facebook.com/Alfa973/', 'display_url': 'facebook.com/Alfa973/'}, {'start': 108, 'end': 131, 'url': 'https://t.co/ooY3lKgxp4', 'expanded_url': 'http://instagram.com/alfa97.3/', 'display_url': 'instagram.com/alfa97.3/'}]"},
 {'user_id': 182777119,
  'urls': "[{'start': 68, 'end': 91, 'url': 'https://t.co/JqTB2h4yFl', 'expanded_url': 'https://www.patreon.com/xahlee', 'display_url': 'patreon.com/xahlee'}]"},
 {'user_id': 28462473,
  'urls': "[{'start': 121, 'end': 144, 'url': 'https://t.co/aIPwXxKx5B', 'expanded_url': 'https

In [214]:
def tmp(x):
    return json.loads(
        x["urls"]
        .replace("'", '"')
        .encode("utf-8")
        .decode("unicode_escape")
        .replace('I"m', "I'm")
        .replace('n"s', "n's")
    )


processed_urls = list(
    map(
        lambda x: {
            **x,
            "raw_urls": x["urls"],
            "urls": tmp(x),
        },
        urls,
    )
)
processed_urls


[{'user_id': 458320553,
  'urls': [{'start': 120,
    'end': 143,
    'url': 'https://t.co/JWFNpA4ZFO',
    'expanded_url': 'http://www.xliveafrica.com',
    'display_url': 'xliveafrica.com'}],
  'raw_urls': "[{'start': 120, 'end': 143, 'url': 'https://t.co/JWFNpA4ZFO', 'expanded_url': 'http://www.xliveafrica.com', 'display_url': 'xliveafrica.com'}]"},
 {'user_id': 248372791,
  'urls': [{'start': 134,
    'end': 157,
    'url': 'https://t.co/IrLvgOa1qo',
    'expanded_url': 'http://bendaly.co.uk',
    'display_url': 'bendaly.co.uk'}],
  'raw_urls': "[{'start': 134, 'end': 157, 'url': 'https://t.co/IrLvgOa1qo', 'expanded_url': 'http://bendaly.co.uk', 'display_url': 'bendaly.co.uk'}]"},
 {'user_id': 857092418,
  'urls': [{'start': 80,
    'end': 103,
    'url': 'https://t.co/UCTfo9adFg',
    'expanded_url': 'http://facebook.com/Alfa973/',
    'display_url': 'facebook.com/Alfa973/'},
   {'start': 108,
    'end': 131,
    'url': 'https://t.co/ooY3lKgxp4',
    'expanded_url': 'http://instag

In [215]:
urls_col.insert_many(processed_urls)

<pymongo.results.InsertManyResult at 0x7f3691f78d90>

In [216]:
cashtags = users.filter(pl.col('entities.description.cashtags').is_not_null())\
    .select(pl.col('id').alias('user_id'), pl.col('entities.description.cashtags').alias('cashtags')).to_dicts()
cashtags

[{'user_id': 195746945,
  'cashtags': "[{'start': 124, 'end': 128, 'tag': 'PRC'}]"},
 {'user_id': 377170397,
  'cashtags': "[{'start': 20, 'end': 25, 'tag': 'ATOM'}, {'start': 26, 'end': 31, 'tag': 'JUNO'}, {'start': 32, 'end': 37, 'tag': 'SCRT'}, {'start': 38, 'end': 43, 'tag': 'GRAV'}]"},
 {'user_id': 20550490,
  'cashtags': "[{'start': 87, 'end': 91, 'tag': 'AMC'}, {'start': 92, 'end': 96, 'tag': 'APE'}]"},
 {'user_id': 14833599,
  'cashtags': "[{'start': 46, 'end': 51, 'tag': 'URBN'}, {'start': 53, 'end': 57, 'tag': 'WMT'}, {'start': 65, 'end': 72, 'tag': 'APT.AX'}]"},
 {'user_id': 327696538,
  'cashtags': "[{'start': 72, 'end': 78, 'tag': 'TSLAQ'}]"},
 {'user_id': 144320244,
  'cashtags': "[{'start': 6, 'end': 11, 'tag': 'HOGE'}]"},
 {'user_id': 135013372,
  'cashtags': "[{'start': 60, 'end': 66, 'tag': 'DimiD'}]"},
 {'user_id': 1029588884911521792,
  'cashtags': "[{'start': 15, 'end': 20, 'tag': 'PENT'}]"},
 {'user_id': 2489830678,
  'cashtags': "[{'start': 56, 'end': 63, 'tag': 

In [217]:
cashtags = list(map(lambda x: {**x, 'cashtag_raw': json.loads(x['cashtags'].replace("\'", '\"')), 'cashtags': list(map(lambda y: y['tag'],json.loads(x['cashtags'].replace("\'", '\"'))))}, cashtags))
cashtags

[{'user_id': 195746945,
  'cashtags': ['PRC'],
  'cashtag_raw': [{'start': 124, 'end': 128, 'tag': 'PRC'}]},
 {'user_id': 377170397,
  'cashtags': ['ATOM', 'JUNO', 'SCRT', 'GRAV'],
  'cashtag_raw': [{'start': 20, 'end': 25, 'tag': 'ATOM'},
   {'start': 26, 'end': 31, 'tag': 'JUNO'},
   {'start': 32, 'end': 37, 'tag': 'SCRT'},
   {'start': 38, 'end': 43, 'tag': 'GRAV'}]},
 {'user_id': 20550490,
  'cashtags': ['AMC', 'APE'],
  'cashtag_raw': [{'start': 87, 'end': 91, 'tag': 'AMC'},
   {'start': 92, 'end': 96, 'tag': 'APE'}]},
 {'user_id': 14833599,
  'cashtags': ['URBN', 'WMT', 'APT.AX'],
  'cashtag_raw': [{'start': 46, 'end': 51, 'tag': 'URBN'},
   {'start': 53, 'end': 57, 'tag': 'WMT'},
   {'start': 65, 'end': 72, 'tag': 'APT.AX'}]},
 {'user_id': 327696538,
  'cashtags': ['TSLAQ'],
  'cashtag_raw': [{'start': 72, 'end': 78, 'tag': 'TSLAQ'}]},
 {'user_id': 144320244,
  'cashtags': ['HOGE'],
  'cashtag_raw': [{'start': 6, 'end': 11, 'tag': 'HOGE'}]},
 {'user_id': 135013372,
  'cashtags':

In [218]:
cashtags_col.insert_many(cashtags)

<pymongo.results.InsertManyResult at 0x7f3676f27fd0>

In [219]:
# cashtags = users.filter(pl.col('entities.description.cashtags').is_not_null())\
#     .select(pl.col('id').alias('user_id'), pl.col('entities.description.cashtags').alias('cashtags')).to_dicts()
users.filter(pl.col('withheld.country_codes').is_not_null())

Unnamed: 0_level_0,protected,username,description,verified,location,name,profile_image_url,id,created_at,entities.description.urls,entities.description.hashtags,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,url,entities.url.urls,pinned_tweet_id,entities.description.mentions,entities.description.cashtags,withheld.country_codes,withheld.scope,source
i64,bool,str,str,bool,str,str,str,i64,str,str,str,i64,i64,i64,i64,str,str,i64,str,str,str,str,str
1834,false,"""ActualidadRT""","""El primer cana...",true,,"""RT en Español""","""https://pbs.tw...",100731315,"""2009-12-31T09:...",,,3495775,67,810333,12340,"""https://t.co/b...","""[{'start': 0, ...",1498676599562063875,,,"""['AT', 'BE', '...",,"""gossipcop_fake..."
16451,false,"""KorsonWolFFXXX...","""THIS PROFILE C...",false,"""Big Apple, USA...","""Korson WolFF""","""https://pbs.tw...",71555905,"""2009-09-04T15:...",,"""[{'start': 133...",1092,4914,23505,6,,,950081948113887232,,,"""['ID']""",,"""gossipcop_fake..."
18369,false,"""recentpoker""","""Love gambling,...",false,"""United States""","""Fuck Around an...","""https://pbs.tw...",14231320,"""2008-03-27T00:...",,"""[{'start': 63,...",6047,1182,161468,136,"""https://t.co/k...","""[{'start': 0, ...",,,,"""['RU']""",,"""gossipcop_fake..."
19201,true,"""SushiCxyrz""",,false,"""Pergatory""","""Bax""","""https://pbs.tw...",776290840717070336,"""2016-09-15T05:...",,,17998,95,240,72,,,1471441197952425988,,,"""['ID']""",,"""gossipcop_fake..."
34966,false,"""zana_medi""","""click @zanaame...",false,"""Syria""",""".""","""https://pbs.tw...",1038078139,"""2012-12-26T21:...",,,14249,1254,34318,263,"""https://t.co/j...","""[{'start': 0, ...",1338953503616749569,"""[{'start': 6, ...",,"""['TR']""",,"""gossipcop_fake..."
49218,false,"""RTSportNews""","""We are RT Spor...",true,,"""RT Sport""","""https://pbs.tw...",3760236855,"""2015-09-24T12:...",,,20977,642,20365,245,"""https://t.co/f...","""[{'start': 0, ...",,,,"""['AT', 'BE', '...",,"""gossipcop_fake..."
49376,false,"""AmyMek""","""Investigative ...",false,"""NYC""","""Amy Mek""","""https://pbs.tw...",954124423,"""2012-11-17T18:...","""[{'start': 132...",,257024,3552,56352,1267,"""https://t.co/W...","""[{'start': 0, ...",1610859717399830528,,,"""['DE', 'FR']""",,"""gossipcop_fake..."
55490,false,"""derasachasauda...","""Confluence of ...",false,"""Sirsa""","""Dera Sacha Sau...","""https://pbs.tw...",82613482,"""2009-10-15T13:...",,,344365,1,5620,99,"""https://t.co/f...","""[{'start': 0, ...",,,,"""['IN']""",,"""gossipcop_fake..."
64335,false,"""Aadi_n""",,false,"""Pakistan""","""Syed Mohammad ...","""https://pbs.tw...",304067927,"""2011-05-23T21:...",,,3068,3245,270972,94,"""https://t.co/r...","""[{'start': 0, ...",1070255951222071296,,,"""['IN']""",,"""gossipcop_fake..."
69627,false,"""Gurmeetramrahi...","""Spiritual Sain...",true,"""sirsa(haryana)...","""Dr.GURMEET RAM...","""https://pbs.tw...",2852359916,"""2014-10-11T20:...",,,1182215,0,2901,1202,"""https://t.co/e...","""[{'start': 0, ...",,,,"""['IN']""",,"""gossipcop_fake..."


## Tweeets

In [5]:
tweets = pd.read_pickle('../dataset/processed_data/master_fnn.pkl')
tweets.to_csv('../dataset/processed_data/master_fnn.csv', index=False)

In [22]:
tweets = pl.read_csv('../dataset/processed_data/master_fnn.csv')

In [2]:
tweets = pd.read_pickle('../dataset/processed_data/master_fnn.pkl')

In [4]:
tweets

Unnamed: 0,lang,id,entities,public_metrics,context_annotations,possibly_sensitive,created_at,author_id,text,conversation_id,edit_history_tweet_ids,reply_settings,in_reply_to_user_id,referenced_tweets,geo,withheld,label
0,en,1029123395739414528,"{'annotations': [{'start': 12, 'end': 24, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '10', 'name': 'Person', 'de...",False,2018-08-13 21:51:52,1012203358512443392,On Air with Ryan Seacrest is offering you a ch...,1029123395739414528,[1029123395739414529],everyone,,,,,false
1,en,998353516434518016,"{'hashtags': [{'start': 110, 'end': 116, 'tag'...","{'retweet_count': 1, 'reply_count': 1, 'like_c...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",False,2018-05-21 00:03:21,829904857305927680,‘American Idol’ final: How to vote for the sea...,998353516434518016,[998353516434518016],everyone,,,,,false
2,en,1051158211208736768,"{'annotations': [{'start': 82, 'end': 87, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",False,2018-10-13 17:10:21,31259532,@ScottDisick @KrisJenner @khloekardashian — LA...,1051158211208736768,[1051158211208736768],everyone,1.019284e+08,"[{'type': 'quoted', 'id': '1050443040668770304'}]",,,false
3,en,1011368336804937728,"{'annotations': [{'start': 10, 'end': 19, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '10', 'name': 'Person', 'de...",False,2018-06-25 21:59:36,194346085,@foquinha Youngblood - 5 Seconds of Summer \nO...,1011289623119716352,[1011368336804937728],everyone,1.814137e+07,"[{'type': 'replied_to', 'id': '101128962311971...",,,false
4,en,954584822474838016,"{'annotations': [{'start': 0, 'end': 11, 'prob...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '10', 'name': 'Person', 'de...",True,2018-01-20 05:22:11,31469390,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,954584822474838016,[954584822474838016],everyone,,,,,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1434570,en,1062509154118569984,"{'mentions': [{'start': 0, 'end': 10, 'usernam...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",False,2018-11-14 00:54:57,25635248,@NBCNewsPR @MeetThePress @chucktodd @RepAdamSc...,1062397000023527424,[1062509154118569985],everyone,1.024803e+08,"[{'type': 'replied_to', 'id': '106239700002352...",,,true
1434571,en,1012233506691284992,"{'annotations': [{'start': 38, 'end': 52, 'pro...","{'retweet_count': 2, 'reply_count': 0, 'like_c...",,False,2018-06-28 07:17:29,325328148,Have you seen our Spotlight report on Initial ...,1012233506691284992,[1012233506691284992],everyone,,,,,true
1434572,en,836898590714916864,"{'annotations': [{'start': 10, 'end': 14, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '35', 'name': 'Politician',...",False,2017-03-01 11:19:07,22367040,President Trump’s First Address to a Joint Ses...,836898590714916864,[836898590714916866],everyone,,,,,true
1434573,it,1021114709674725376,"{'mentions': [{'start': 0, 'end': 16, 'usernam...","{'retweet_count': 0, 'reply_count': 1, 'like_c...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",False,2018-07-22 19:28:12,4709603859,@GianMarcoMelosu @SkySportF1 @LewisHamilton @M...,1021089042786594816,[1021114709674725382],everyone,2.377196e+09,"[{'type': 'replied_to', 'id': '102109095367800...",,,true


In [16]:
tweets.columns

Index(['lang', 'id', 'entities', 'public_metrics', 'context_annotations',
       'possibly_sensitive', 'created_at', 'author_id', 'text',
       'conversation_id', 'edit_history_tweet_ids', 'reply_settings',
       'in_reply_to_user_id', 'referenced_tweets', 'geo', 'withheld', 'label'],
      dtype='object')

In [48]:
tweets_pd.dtypes

text    object
dtype: object

In [99]:
tweets_pd = tweets[
    [
        "lang",
        "id",
        "entities",
        "public_metrics",
        "context_annotations",
        "possibly_sensitive",
        "created_at",
        "author_id",
        "text",
        "conversation_id",
        "edit_history_tweet_ids",
        "reply_settings",
        "in_reply_to_user_id",
        "referenced_tweets",
        "geo",
        "withheld",
        "label",
    ]
]
# tweets_pd[1265792:1265794]
tweets_pd['text'] = tweets_pd['text'].astype(str)


In [100]:
tweets_pl = pl.from_pandas(tweets_pd, schema_overrides={"text": pl.Utf8})
tweets_pl.head()

lang,id,entities,public_metrics,context_annotations,possibly_sensitive,created_at,author_id,text,conversation_id,edit_history_tweet_ids,reply_settings,in_reply_to_user_id,referenced_tweets,geo,withheld,label
str,i64,struct[5],struct[5],list[struct[2]],bool,datetime[ns],i64,str,i64,list[str],str,f64,list[struct[2]],struct[1],struct[3],str
"""en""",1029123395739414528,"{[{24,""Ryan Seacrest"",0.6396,12,""Other""}, {83,""Ryan"",0.549,80,""Other""}, {124,""Capital One® Quicksilver®"",0.6888,100,""Other""}],null,[{174,162,""sweepstakes""}],null,[{null,""ul.ink/E3ZP-5MNDM2_T"",198,""http://ul.ink/E3ZP-5MNDM2_T"",null,null,175,null,null,null,""https://t.co/IMouHOWBuy""}]}","{0,0,0,0,0}","[{{""Named people in the world like Nelson Mandela"",""10"",""Person""},{""Ryan Seacrest"",""808677941294968833"",""Ryan Seacrest""}}, {{""Top level entities that describe a Brands industry"",""45"",""Brand Vertical""},{null,""781974596148793345"",""Business & finance""}}, ... {{""An entertainment personality in the world, like Anderson Cooper or Miranda Sings"",""58"",""Entertainment Personality""},{""Ryan Seacrest"",""808677941294968833"",""Ryan Seacrest""}}]",False,2018-08-13 21:51:52,1012203358512443392,"""On Air with Ry...",1029123395739414528,"[""1029123395739414529""]","""everyone""",,,{null},"{null,null,null}","""false"""
"""en""",998353516434518016,"{[{13,""American Idol"",0.917,1,""Other""}, {80,""GoldDerby"",0.4798,72,""Person""}, {115,""Gabby"",0.6023,111,""Person""}],null,[{116,110,""Gabby""}],null,[{null,""goldderby.com/article/2018/a…"",105,""http://www.goldderby.com/article/2018/american-idol-how-to-vote-may-20-winner-season-16/#utm_medium=social&utm_source=twitter&utm_campaign=social_bar&utm_content=top_amp&utm_id=1202550373"",null,null,82,null,null,null,""https://t.co/h2ur471euv""}]}","{0,1,0,1,1}","[{{""Television shows from around the world"",""3"",""TV Shows""},{""Judges Luke Bryan, Katy Perry and Lionel Richie set out to discover the next American Idol. Photo via @AmericanIdol"",""10001083292"",""American Idol""}}, {{""Television show episodes"",""4"",""TV Episodes""},{null,""10053565198"",""Performance Finals""}}]",False,2018-05-21 00:03:21,829904857305927680,"""‘American Idol...",998353516434518016,"[""998353516434518016""]","""everyone""",,,{null},"{null,null,null}","""false"""
"""en""",1051158211208736768,"{[{87,""Banksy"",0.3711,82,""Person""}, {104,""KUWTK"",0.4415,100,""Person""}, ... {130,""Kardashian"",0.9158,121,""Person""}],null,[{88,81,""Banksy""}, {98,89,""Sothebys""}, ... {131,120,""Kardashian""}],[{12,""101928415"",0,""ScottDisick""}, {24,""23613479"",13,""KrisJenner""}, {41,""32959253"",25,""khloekardashian""}],[{null,""twitter.com/SidArthurNYC/s…"",155,""https://twitter.com/SidArthurNYC/status/1050443040668770304"",null,null,132,null,null,null,""https://t.co/qsiMhf6OdF""}]}","{0,0,0,0,0}","[{{""Television shows from around the world"",""3"",""TV Shows""},{""Life with the Kardashian family. Photo via @KUWTK"",""10000283194"",""Keeping Up With the Kardashians""}}, {{""Television shows from around the world"",""3"",""TV Shows""},{null,""10035545983"",""Keeping Up With the Kardashians (Ire)""}}, ... {{""An entertainment personality in the world, like Anderson Cooper or Miranda Sings"",""58"",""Entertainment Personality""},{""Scott Disick"",""808724992489725952"",""Scott Disick""}}]",False,2018-10-13 17:10:21,31259532,"""@ScottDisick @...",1051158211208736768,"[""1051158211208736768""]","""everyone""",101928415.0,"[{""1050443040668770304"",""quoted""}]",{null},"{null,null,null}","""false"""
"""en""",1011368336804937728,"{[{19,""Youngblood"",0.8729,10,""Other""}, {51,""5 Seconds of Summer Only You"",0.9267,23,""Other""}, ... {216,""ZAYN"",0.6774,213,""Person""}],null,null,[{9,""18141369"",0,""foquinha""}],null}","{0,0,0,0,0}","[{{""Named people in the world like Nelson Mandela"",""10"",""Person""},{""CNCO"",""959432183671808002"",""CNCO""}}, {{""A musician in the world, like Adele or Bob Dylan"",""54"",""Musician""},{""CNCO"",""959432183671808002"",""CNCO""}}, ... {{""An actor or actress in the world, like Kate Winslet or Leonardo DiCaprio"",""56"",""Actor""},{""Selena Gomez"",""806556845703766016"",""Selena Gomez""}}]",False,2018-06-25 21:59:36,194346085,"""@foquinha Youn...",1011289623119716352,"[""1011368336804937728""]","""everyone""",18141369.0,"[{""1011289623119716354"",""replied_to""}]",{null},"{null,null,null}","""false"""
"""en""",954584822474838016,"{[{11,""Kylie Jenner"",0.9692,0,""Person""}, {46,""Tyga"",0.562,43,""Person""}],null,null,null,[{null,""fb.me/3PSwChR2r"",108,""https://fb.me/3PSwChR2r"",null,null,85,null,null,null,""https://t.co/LAzr982OBz""}]}","{0,0,0,0,0}","[{{""Named people in the world like Nelson Mandela"",""10"",""Person""},{""Kylie Jenner"",""806579980960350208"",""Kylie Jenner""}}, {{""Named people in the world like Nelson Mandela"",""10"",""Person""},{""Tyga"",""819537325268926464"",""Tyga""}}, ... {{""An entertainment personality in the world, like Anderson Cooper or Miranda Sings"",""58"",""Entertainment Personality""},{""Kylie Jenner"",""806579980960350208"",""Kylie Jenner""}}]",True,2018-01-20 05:22:11,31469390,"""Kylie Jenner ‘...",954584822474838016,"[""954584822474838016""]","""everyone""",,,{null},"{null,null,null}","""false"""


In [112]:
def handle_replied_tweets(x, match_type):
    currentLs = list(map(lambda x: x['id'], filter(lambda z:  z['type'] == match_type,x)))
    return currentLs if len(currentLs) > 0 else None

tweets_pl_extracted = tweets_pl.select([
    pl.col('id').alias('tweet_id'),
    pl.col('text').alias('content'),
    'created_at',
    'author_id',
    'possibly_sensitive',
    'reply_settings',
    'conversation_id',
    'edit_history_tweet_ids',
    'lang',
    'public_metrics',
    'label',
    pl.col('referenced_tweets').apply(lambda x: handle_replied_tweets(x, 'replied_to')).alias('replied_to_ids'),
    pl.col('referenced_tweets').apply(lambda x: handle_replied_tweets(x, 'quoted')).alias('quote_ids'),
]).unnest('public_metrics')

In [165]:
tweets_dict = tweets_pl.to_dicts()

In [125]:
entities = tweets_pl.select([
    pl.col('id').alias('tweet_id'),
    pl.col('entities')
]).unnest('entities')

In [118]:
# Annotation
annotation = (
    entities.select(["id", pl.col("entities").apply(lambda x: x["annotations"]).alias('annotations')])
    .explode("annotations")
    # .select(
    #     pl.struct(["id", "annotations"])
    #     .apply(lambda x: {**x["annotations"], "tweet_id": x["id"]})
    #     .alias("item")
    # )
)


In [134]:
entities.select('tweet_id', 'annotations').explode('annotations')

tweet_id,annotations
i64,struct[5]
1029123395739414528,"{24,""Ryan Seacrest"",0.6396,12,""Other""}"
1029123395739414528,"{83,""Ryan"",0.549,80,""Other""}"
1029123395739414528,"{124,""Capital One® Quicksilver®"",0.6888,100,""Other""}"
998353516434518016,"{13,""American Idol"",0.917,1,""Other""}"
998353516434518016,"{80,""GoldDerby"",0.4798,72,""Person""}"
998353516434518016,"{115,""Gabby"",0.6023,111,""Person""}"
1051158211208736768,"{87,""Banksy"",0.3711,82,""Person""}"
1051158211208736768,"{104,""KUWTK"",0.4415,100,""Person""}"
1051158211208736768,"{111,""Kanye"",0.9624,107,""Person""}"
1051158211208736768,"{118,""Trump"",0.9755,114,""Person""}"


In [135]:
# Hashtag
hashtags = (
    entities.select(["id", pl.col("entities").apply(lambda x: x["hashtags"]).alias('hashtags')])
    .explode("hashtags")
    .select(
        pl.struct(["id", "hashtags"])
        .apply(lambda x: {**x["hashtags"], "tweet_id": x["id"]})
        .alias("item")
    )
)



ColumnNotFoundError: id

Error originated just after this operation:
DF ["tweet_id", "annotations", "cashtags", "hashtags"]; PROJECT */6 COLUMNS; SELECTION: "None"

In [None]:
# Urls
urls = (
    entities.select(["id", pl.col("entities").apply(lambda x: x["urls"]).alias('urls')])
    .explode("urls")
    .select(
        pl.struct(["id", "urls"])
        .apply(lambda x: {**x["urls"], "tweet_id": x["id"]})
        .alias("item")
    )
)


In [None]:
# cashtags
cashtags = (
    entities.select(["id", pl.col("entities").apply(lambda x: x["cashtags"]).alias('cashtags')])
    .explode("cashtags")
    .select(
        pl.struct(["id", "cashtags"])
        .apply(lambda x: {**x["cashtags"], "tweet_id": x["id"]})
        .alias("item")
    )
)


In [None]:
# mentions
mentions = (
    entities.select(["id", pl.col("entities").apply(lambda x: x["mentions"]).alias('mentions')])
    .explode("mentions")
    .select(
        pl.struct(["id", "mentions"])
        .apply(lambda x: {**x["mentions"], "tweet_id": x["id"]})
        .alias("item")
    )
)


In [None]:
def extract_item(x):
    return x['item']

annotation_dict = list(map(extract_item, annotation.to_dicts()))
hashtags_dict = list(map(extract_item, hashtags.to_dicts()))
urls_dict = list(map(extract_item, urls.to_dicts()))
cashtags_dict = list(map(extract_item, cashtags.to_dicts()))
mentions_dict = list(map(extract_item, mentions.to_dicts()))

annotation_dict

[{'end': 24,
  'normalized_text': 'Ryan Seacrest',
  'probability': 0.6396000000000001,
  'start': 12,
  'tweet_id': 1029123395739414528,
  'type': 'Other'},
 {'end': 83,
  'normalized_text': 'Ryan',
  'probability': 0.549,
  'start': 80,
  'tweet_id': 1029123395739414528,
  'type': 'Other'},
 {'end': 124,
  'normalized_text': 'Capital One® Quicksilver®',
  'probability': 0.6888000000000001,
  'start': 100,
  'tweet_id': 1029123395739414528,
  'type': 'Other'},
 {'end': 13,
  'normalized_text': 'American Idol',
  'probability': 0.917,
  'start': 1,
  'tweet_id': 998353516434518016,
  'type': 'Other'},
 {'end': 80,
  'normalized_text': 'GoldDerby',
  'probability': 0.4798,
  'start': 72,
  'tweet_id': 998353516434518016,
  'type': 'Person'},
 {'end': 115,
  'normalized_text': 'Gabby',
  'probability': 0.6023000000000001,
  'start': 111,
  'tweet_id': 998353516434518016,
  'type': 'Person'},
 {'end': 87,
  'normalized_text': 'Banksy',
  'probability': 0.37110000000000004,
  'start': 82,


In [None]:
import json

In [None]:
json.loads("[{'domain': {'id': '10', 'name': 'Person', 'description': 'Named people in the world like Nelson Mandela'}, 'entity': {'id': '808677941294968833', 'name': 'Ryan Seacrest', 'description': 'Ryan Seacrest'}}, {'domain': {'id': '45', 'name': 'Brand Vertical', 'description': 'Top level entities that describe a Brands industry'}, 'entity': {'id': '781974596148793345', 'name': 'Business & finance'}}, {'domain': {'id': '30', 'name': 'Entities [Entity Service]', 'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'}, 'entity': {'id': '781974596807368705', 'name': 'Credit Cards - Business & finance'}}, {'domain': {'id': '47', 'name': 'Brand', 'description': 'Brands and Companies'}, 'entity': {'id': '10026414822', 'name': 'Capital One'}}, {'domain': {'id': '58', 'name': 'Entertainment Personality', 'description': 'An entertainment personality in the world, like Anderson Cooper or Miranda Sings'}, 'entity': {'id': '808677941294968833', 'name': 'Ryan Seacrest', 'description': 'Ryan Seacrest'}}]".replace("'", '"'))

[{'domain': {'id': '10',
   'name': 'Person',
   'description': 'Named people in the world like Nelson Mandela'},
  'entity': {'id': '808677941294968833',
   'name': 'Ryan Seacrest',
   'description': 'Ryan Seacrest'}},
 {'domain': {'id': '45',
   'name': 'Brand Vertical',
   'description': 'Top level entities that describe a Brands industry'},
  'entity': {'id': '781974596148793345', 'name': 'Business & finance'}},
 {'domain': {'id': '30',
   'name': 'Entities [Entity Service]',
   'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'},
  'entity': {'id': '781974596807368705',
   'name': 'Credit Cards - Business & finance'}},
 {'domain': {'id': '47',
   'name': 'Brand',
   'description': 'Brands and Companies'},
  'entity': {'id': '10026414822', 'name': 'Capital One'}},
 {'domain': {'id': '58',
   'name': 'Entertainment Personality',
   'description': 'An entertainment personality in the world, like Anderson Cooper or Miranda S

In [None]:
tweets = pd.read_pickle('../dataset/processed_data/master_fnn.pkl')

In [None]:
tweets['context_annotations']

0          [{'domain': {'id': '10', 'name': 'Person', 'de...
1          [{'domain': {'id': '3', 'name': 'TV Shows', 'd...
2          [{'domain': {'id': '3', 'name': 'TV Shows', 'd...
3          [{'domain': {'id': '10', 'name': 'Person', 'de...
4          [{'domain': {'id': '10', 'name': 'Person', 'de...
                                 ...                        
1434570    [{'domain': {'id': '3', 'name': 'TV Shows', 'd...
1434571                                                 None
1434572    [{'domain': {'id': '35', 'name': 'Politician',...
1434573    [{'domain': {'id': '3', 'name': 'TV Shows', 'd...
1434574                                                 None
Name: context_annotations, Length: 1434575, dtype: object

In [None]:
tmp = tweets[tweets['context_annotations'] != None][['id', 'context_annotations']].explode('context_annotations').reset_index(drop=True)
tmp

Unnamed: 0,id,context_annotations
0,1029123395739414528,"{'domain': {'id': '10', 'name': 'Person', 'des..."
1,1029123395739414528,"{'domain': {'id': '45', 'name': 'Brand Vertica..."
2,1029123395739414528,"{'domain': {'id': '30', 'name': 'Entities [Ent..."
3,1029123395739414528,"{'domain': {'id': '47', 'name': 'Brand', 'desc..."
4,1029123395739414528,"{'domain': {'id': '58', 'name': 'Entertainment..."
...,...,...
4547784,1021114709674725376,"{'domain': {'id': '26', 'name': 'Sports League..."
4547785,1021114709674725376,"{'domain': {'id': '60', 'name': 'Athlete', 'de..."
4547786,1021114709674725376,"{'domain': {'id': '69', 'name': 'News Vertical..."
4547787,1021114709674725376,"{'domain': {'id': '26', 'name': 'Sports League..."


In [None]:
context_annotations = pl.from_pandas(tmp)

In [140]:
context_annotations_extracted = tweets_pl.select([
    pl.col('id').alias('tweet_id'),
    'context_annotations'
]).explode('context_annotations').unnest('context_annotations').unnest('domain').select([
    pl.col('tweet_id'),
    pl.col('description').alias('domain_description'),
    pl.col('id').alias('domain_id'),
    pl.col('name').alias('domain_name'),
    pl.col('entity'),
]).unnest('entity').select([
    pl.col('tweet_id'),
    pl.col('domain_description'),
    pl.col('domain_id').cast(pl.Int64),
    pl.col('domain_name'),
    pl.col('description').alias('entity_description'),
    pl.col('id').alias('entity_id').cast(pl.Int64),
    pl.col('name').alias('entity_name'),
])

context_annotations_extracted

tweet_id,domain_description,domain_id,domain_name,entity_description,entity_id,entity_name
i64,str,i64,str,str,i64,str
1029123395739414528,"""Named people i...",10,"""Person""","""Ryan Seacrest""",808677941294968833,"""Ryan Seacrest"""
1029123395739414528,"""Top level enti...",45,"""Brand Vertical...",,781974596148793345,"""Business & fin..."
1029123395739414528,"""Entity Service...",30,"""Entities [Enti...",,781974596807368705,"""Credit Cards -..."
1029123395739414528,"""Brands and Com...",47,"""Brand""",,10026414822,"""Capital One"""
1029123395739414528,"""An entertainme...",58,"""Entertainment ...","""Ryan Seacrest""",808677941294968833,"""Ryan Seacrest"""
998353516434518016,"""Television sho...",3,"""TV Shows""","""Judges Luke Br...",10001083292,"""American Idol"""
998353516434518016,"""Television sho...",4,"""TV Episodes""",,10053565198,"""Performance Fi..."
1051158211208736768,"""Television sho...",3,"""TV Shows""","""Life with the ...",10000283194,"""Keeping Up Wit..."
1051158211208736768,"""Television sho...",3,"""TV Shows""",,10035545983,"""Keeping Up Wit..."
1051158211208736768,"""Television sho...",3,"""TV Shows""",,10041386809,"""Keeping Up Wit..."


In [20]:
context_annotations_dict = context_annotations_extracted.to_dicts()