# Mini-project I: Twitter & Neo4j
## Team ID: 3

- Στοϊκοπούλου Ελεονώρα 126
- Γεροχρήστου Μαργαρίτα 150
- Προκοπίου Ιωάννης 132

In [None]:
import json
from bson.json_util import dumps
from pprintpp import pprint
import pandas as pd
from typing import NoReturn

In [None]:
def create_data_dict(path: str = 'WDM1_2.json') -> dict:
    """Read a MongoDB JSON export and return its documents keyed by id.

    The file is expected to contain one JSON document per line. Each line is
    parsed on its own, so the export never has to be held in memory as one
    big string. Blank lines are ignored.

    Args:
        path (str): Location of the exported file. Defaults to the file name
            the notebook has always used.

    Returns:
        dict: A dictionary of dictionaries.
            Each key is the document's ``_id.$oid`` value and each value is
            the parsed MongoDB document.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a document has no ``_id.$oid`` field.
    """
    data_dict = {}

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (the documents contain free text).
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            item_dict = json.loads(line)
            data_dict[item_dict['_id']['$oid']] = item_dict

    return data_dict

In [None]:
def create_tweets_csvs(data_dict: dict) -> None:
    """Create the tweet nodes CSV as well as the tweeted and retweeted relationship CSVs

    tweeted -> user - tweet relationship
    retweeted -> user - tweet relationship

    A document without a usable ``data.referenced_tweets[0].type`` is treated
    as an original tweet: its node gets the type "tweet" and a ``tweeted``
    row is written. A document whose first referenced tweet has the type
    "retweeted" produces a ``retweeted`` row. Any other referenced type only
    produces the tweet node.

    Writes ``tweets.csv``, ``tweeted.csv`` and ``retweeted.csv`` into the
    current working directory.

    Args:
        data_dict (dict): the data dictionary with all the documents

    Raises:
        KeyError: If a document lacks one of the mandatory fields.
        IndexError: If ``includes.tweets`` or ``includes.users`` is empty.
    """
    # Collect plain rows first; growing a DataFrame one row at a time
    # re-allocates on every insert.
    tweet_rows = []
    tweeted_rows = []
    retweeted_rows = []

    # Loop all the documents in the dictionary
    for doc in data_dict.values():
        tweet = doc['includes']['tweets'][0]
        tweet_id = tweet['id']
        date = tweet['created_at_converted']['$date']

        referenced = doc['data'].get('referenced_tweets')
        ref_type = referenced[0].get('type') if referenced else None

        if ref_type is None:
            # No referenced tweet: an original tweet
            tweet_type, relation_rows = 'tweet', tweeted_rows
        elif ref_type == 'retweeted':
            tweet_type, relation_rows = ref_type, retweeted_rows
        else:
            # Other referenced types only get a tweet node
            tweet_type, relation_rows = ref_type, None

        tweet_rows.append([tweet_id, tweet_type, date])

        if relation_rows is not None:
            user_id = doc['includes']['users'][0]['id']
            tweet_creation_date = doc['data']['created_at']
            relation_rows.append([user_id, tweet_id, tweet_creation_date])

    # dtype=object keeps every value exactly as it was read from the documents
    tweets = pd.DataFrame(
        tweet_rows, columns=['tweet_id', 'type', 'creation_date'], dtype=object
    )
    relation_columns = ['user_id', 'tweet_id', 'tweet_creation_date']
    tweeted = pd.DataFrame(tweeted_rows, columns=relation_columns, dtype=object)
    retweeted = pd.DataFrame(retweeted_rows, columns=relation_columns, dtype=object)

    tweets['creation_date'] = pd.to_datetime(tweets['creation_date'])
    tweets['date'] = tweets['creation_date'].dt.date

    # Save the dataframes as CSV
    tweets.to_csv('tweets.csv', index=False)
    tweeted.to_csv('tweeted.csv', index=False)
    retweeted.to_csv('retweeted.csv', index=False)


In [None]:
def create_users_csv(data_dict: dict) -> None:
    """Create the user nodes CSV

    One row is kept per user id. When a user appears in several documents,
    the following/followers counts come from the document with the most
    recent ``data.created_at``. Documents that lack any of the required
    fields are skipped.

    Writes ``users.csv`` into the current working directory.

    Args:
        data_dict (dict): the data dictionary with all the documents
    """
    rows = []

    # Loop all the documents in the dictionary
    for doc in data_dict.values():
        try:
            # Keep all the necessary fields
            user = doc['includes']['users'][0]
            metrics = user['public_metrics']
            rows.append([
                user['id'],
                metrics['following_count'],
                metrics['followers_count'],
                doc['data']['created_at'],
            ])
        except (KeyError, IndexError):
            # Best effort: a document without complete user data
            # contributes nothing to the user nodes.
            continue

    # dtype=object keeps the counts exactly as they were read
    users = pd.DataFrame(
        rows,
        columns=['user_id', 'following', 'followers', 'tweet_created_at'],
        dtype=object,
    )

    # Convert the datetime string to datetime and sort from newest to oldest date.
    # A stable sort keeps the first-seen document on identical timestamps.
    users['tweet_created_at'] = pd.to_datetime(
        users['tweet_created_at'], format="%Y-%m-%dT%H:%M:%S.%fZ"
    )
    users = users.sort_values(by='tweet_created_at', ascending=False, kind='mergesort')

    # Drop duplicate user ids but keep the latest record to have the latest
    # possible following/followers count
    users = users.drop_duplicates(subset='user_id', keep='first')

    # Drop tweet created time column
    users = users.drop(columns='tweet_created_at')

    # Save the dataframe as CSV
    users.to_csv('users.csv', index=False)

In [None]:
def create_url_hashtag_csvs(data_dict: dict) -> None:
    """Create the URL and Hashtags nodes CSVs as well as the has_url, used_url,
    has_hashtag and used_hashtag relationship CSVs

    has_url, has_hashtag -> tweet - entity relationship
    used_url, used_hashtag -> user - entity relationship

    Only the entities of the first tweet of each document are used, since
    that is the tweet the tweet-node CSV is built from. Hashtags are
    lower-cased and stripped so that spelling variants collapse into one
    node. Documents that lack the tweet, its entities or the user are
    skipped as a whole, so no half-recorded relationships are written.

    Writes ``urls.csv``, ``has_url.csv``, ``used_url.csv``, ``hashtags.csv``,
    ``has_hashtag.csv`` and ``used_hashtag.csv`` into the current working
    directory.

    Args:
        data_dict (dict): the data dictionary with all the documents
    """
    url_rows, has_url_rows, used_url_rows = [], [], []
    hashtag_rows, has_hashtag_rows, used_hashtag_rows = [], [], []

    # Loop all the documents in the dictionary
    for doc in data_dict.values():
        try:
            # Resolve everything the rows need up front, so a document with a
            # missing user cannot leave node/has_* rows without used_* rows.
            tweet = doc['includes']['tweets'][0]
            tweet_entities = tweet['entities']
            tweet_id = tweet['id']
            user_id = doc['includes']['users'][0]['id']

            # Get all the URLs from the entities of the first tweet
            for url in tweet_entities.get('urls', []):
                url_str = url['expanded_url']

                url_rows.append([url_str])
                has_url_rows.append([tweet_id, url_str])
                used_url_rows.append([user_id, url_str])

            # Get all the hashtags from the entities of the first tweet
            for hashtag in tweet_entities.get('hashtags', []):
                # The tag is a plain Python string, so use the str methods
                # directly (the pandas ``.str`` accessor does not exist here).
                hashtag_str = hashtag['tag'].lower().strip()

                hashtag_rows.append([hashtag_str])
                has_hashtag_rows.append([tweet_id, hashtag_str])
                used_hashtag_rows.append([user_id, hashtag_str])
        except (KeyError, IndexError):
            # Best effort: stop processing a document with missing fields
            continue

    # Create the pandas dataframes
    urls = pd.DataFrame(url_rows, columns=['url'], dtype=object)
    has_url = pd.DataFrame(has_url_rows, columns=['tweet_id', 'url'], dtype=object)
    used_url = pd.DataFrame(used_url_rows, columns=['user_id', 'url'], dtype=object)

    hashtags = pd.DataFrame(hashtag_rows, columns=['hashtag'], dtype=object)
    has_hashtag = pd.DataFrame(
        has_hashtag_rows, columns=['tweet_id', 'hashtag'], dtype=object
    )
    used_hashtag = pd.DataFrame(
        used_hashtag_rows, columns=['user_id', 'hashtag'], dtype=object
    )

    # Deduplicate the dataframes and save them as CSVs
    outputs = [
        (urls, 'urls.csv'),
        (has_url, 'has_url.csv'),
        (used_url, 'used_url.csv'),
        (hashtags, 'hashtags.csv'),
        (has_hashtag, 'has_hashtag.csv'),
        (used_hashtag, 'used_hashtag.csv'),
    ]
    for frame, file_name in outputs:
        frame.drop_duplicates().to_csv(file_name, index=False)

In [None]:
def mentioned_csv(data_dict: dict) -> None:
    """Create the mentioned and replied_to relationship CSVs

    mentioned -> user - user relationship
    replied_to -> user - user relationship

    Per document:
      * retweet: one ``mentioned`` row pointing at the author of
        ``includes.tweets[1]``; the mention entities are not read.
      * reply: one ``replied_to`` row whose target is taken from
        ``includes.tweets[1].author_id`` or, when there is no second tweet,
        from ``includes.users[1].id``; no row if neither exists.
      * every non-retweet: one ``mentioned`` row per mention entity of the
        first tweet.

    Documents with missing fields are skipped from the point where the
    missing field is needed. Writes ``mentioned.csv`` and ``replied_to.csv``
    into the current working directory.

    Args:
        data_dict (dict): the data dictionary with all the documents
    """
    mentioned_rows = []
    replied_rows = []

    # Loop all the documents in the dictionary
    for doc in data_dict.values():
        try:
            includes = doc['includes']

            # Extract the source user ID
            source_user_id = includes['users'][0]['id']

            referenced = doc['data'].get('referenced_tweets')
            ref_type = referenced[0]['type'] if referenced else None

            # In the case of retweets we keep the author of tweets[1], which
            # is the original tweet, and add it to the mentioned file since
            # we want to keep it in the mention-network even if it doesn't
            # exist in the mentions entities of the tweet
            if ref_type == 'retweeted':
                mentioned_rows.append([
                    source_user_id,
                    includes['tweets'][1]['author_id'],
                    includes['tweets'][0]['id'],
                ])
                continue

            # In the case of a reply the original tweet author could be
            # either under tweets[1]['author_id'] or users[1]['id'], and
            # there were cases where both were missing
            if ref_type == 'replied_to':
                tweets, users = includes['tweets'], includes['users']
                if len(tweets) > 1:
                    replied_rows.append([source_user_id, tweets[1]['author_id']])
                elif len(users) > 1:
                    replied_rows.append([source_user_id, users[1]['id']])

            # Extract mention entities for all the non-retweets
            tweet = includes['tweets'][0]
            for mention in tweet['entities']['mentions']:
                mentioned_rows.append([source_user_id, mention['id'], tweet['id']])

        except (KeyError, IndexError):
            # Best effort: e.g. a tweet without mention entities or a
            # retweet whose original tweet is not included
            continue

    # Create the pandas dataframes
    mentioned = pd.DataFrame(
        mentioned_rows,
        columns=['source_user_id', 'target_user_id', 'tweet_id'],
        dtype=object,
    )
    replied_to = pd.DataFrame(
        replied_rows, columns=['source_user_id', 'target_user_id'], dtype=object
    )

    mentioned = mentioned.drop_duplicates()
    replied_to = replied_to.drop_duplicates()

    # Save the dataframes as CSV
    mentioned.to_csv('mentioned.csv', index=False)
    replied_to.to_csv('replied_to.csv', index=False)

In [None]:
# Parse the MongoDB export once, then let every exporter write its CSV files
# from the same in-memory documents (same order as before: tweets, users,
# URLs/hashtags, mentions/replies).
data_dict = create_data_dict()
for export_csvs in (
    create_tweets_csvs,
    create_users_csv,
    create_url_hashtag_csvs,
    mentioned_csv,
):
    export_csvs(data_dict)