In [1]:
# # Install all dependencies as follows:

# !pip install spacy
# !python -m spacy download en_core_web_sm

# !pip install git+https://github.com/JustAnotherArchivist/snscrape.git
# import snscrape.modules.twitter as sntwitter
# #!pip install pymongo[srv]
#!
# # !conda install git

In [2]:
# Inject a CSS rule that sets the `.container` element to 90% of the page width.
# `display`/`HTML` are imported from the public `IPython.display` module (the
# same module the imports cell uses) instead of the deprecated
# `IPython.core.display` path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
import json, csv, uuid, config, os, logging, re
import pymongo
import subprocess
#import pandas as pd
import numpy as np
import spacy
from IPython.display import display_javascript, display_html, display
from datetime import datetime, date, time, timedelta
from pathlib import Path
from pprint import pprint
from collections import Counter
import snscrape.modules.twitter as sntwitter
from pandas_profiling import ProfileReport

In [4]:
# Load the spaCy pipeline named 'en_core_web_sm' once; the sentence-splitting
# and word-counting helpers in this notebook call `nlp(...)` on tweet text.
nlp = spacy.load('en_core_web_sm')

In [21]:
# Append DEBUG-and-above log records to the file "logs" in the working directory.
# Timestamps use the 24-hour clock (%H): the previous 12-hour %I had no AM/PM
# marker, so morning and afternoon entries were indistinguishable.
# Milliseconds are zero-padded to three digits so ",7" cannot be read as ",700".
logging.basicConfig(
    filename='logs',
    filemode='a',
    format='%(asctime)s,%(msecs)03d %(name)s %(levelname)s %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.DEBUG,
)

# Local data locations.
data_folder = Path("data/")
covid_tweets_files = data_folder.joinpath("covid_tweets.json")

# Timestamp pattern (not referenced by the cells visible in this notebook).
fmt = "%Y-%m-%d %H:%M:%S"

# Search phrases; the driver loop runs one scrape per entry.
phrases = [
    'covid-19 mental health',
    'depression covid-19',
    'Covid Mental Health students',
    'students coronavirus',
    'college mental health covid',
    'isolation covid students',
    'isolation coronavirus',
    'students depression covid-19',
    'students depression covid',
]

# Search window: from 360 days ago until now.
since = datetime.now() - timedelta(days=360)
until = datetime.now()

# Upper bound on tweets handled per phrase (passed to load_tweets).
max_results = 1

# Flattened field names (nested keys joined with "_") that filter_tweet keeps
# for the processed collection.
# Each entry must end with a comma: adjacent string literals are silently
# concatenated, which previously merged 'coordinates_latitude' and
# 'user_username' into one non-existent key and dropped both fields.
filter_list = [
    'id',
    'content',
    'coordinates_longitude',
    'coordinates_latitude',
    'user_username',
    'user_displayname',
    'user_verified',
    'user_followersCount',
    'date',
    'user_created',
    'likeCount',
    'retweetCount',
    'url',
    'retweetedTweet',
    'replyCount',
]

# MongoDB database name and the names of the two collections written by the loader:
# one for tweets as scraped, one for filtered/cleaned tweets.
# NOTE(review): "PROCESSD" looks like a misspelling of "PROCESSED"; both the
# identifier and the stored collection name are used elsewhere, so they are
# left untouched here.
DB_NAME = 'Covid_Mental_Health_V2'
RAW_DATA_COLLECTION_NAME = 'RAW_DATA'
PROCESSD_DATA_COLLECTION_NAME = 'PROCESSD_DATA'

In [6]:
# Connect to MongoDB using the address held in the local `config` module
# (`config.Mongo_Address`), so the connection string is not written in the notebook.
client = pymongo.MongoClient(config.Mongo_Address)
# Database handle used by insert_tweet_to_database and the DataFrame cells.
mydb = client[DB_NAME]

In [7]:
def flatten_dict(dd, separator='_', prefix=''):
    """Flatten a nested dict into a single-level dict with joined keys.

    Keys along a nesting path are joined with `separator`, so
    ``{'user': {'id': 1}}`` becomes ``{'user_id': 1}``.

    Args:
        dd: The value to flatten. A non-dict value is returned as
            ``{prefix: dd}``.
        separator: String placed between the parts of a joined key.
        prefix: Key prefix for the current level ('' at the top level).

    Returns:
        A new flat dict. Nested dicts that are empty contribute no keys.
    """
    # Leaf value: report it under the key accumulated so far.
    if not isinstance(dd, dict):
        return {prefix: dd}

    flat = {}
    for outer_key, outer_value in dd.items():
        # Flatten the child with its own key as prefix, then prepend ours.
        for inner_key, inner_value in flatten_dict(outer_value, separator, outer_key).items():
            if prefix:
                flat[prefix + separator + inner_key] = inner_value
            else:
                flat[inner_key] = inner_value
    return flat

In [8]:
def filter_tweet(tweet,fields):
    """Flatten a tweet dict and keep only the requested fields.

    Args:
        tweet: Nested tweet dict; it is flattened with flatten_dict first.
        fields: Iterable of flattened key names to keep.

    Returns:
        A new dict holding the entries of the flattened tweet whose key is
        listed in `fields`, in the order of `fields`. Names that are absent
        from the flattened tweet are skipped.
    """
    flattened = flatten_dict(tweet)

    kept = {}
    for name in fields:
        if name in flattened:
            kept[name] = flattened[name]

    return kept

In [9]:
def remove_links_from_tweet_content(tweet):
    """Strip links, @-mentions and the retweet marker from a tweet's text.

    The tweet dict is modified in place and also returned.

    Args:
        tweet: Dict with a string under the 'content' key.

    Returns:
        The same dict object, with 'content' replaced by the cleaned text.
        Surrounding whitespace is left as it was.

    Raises:
        KeyError: If the tweet has no 'content' key.
    """
    text = tweet['content']

    # Remove both http:// and https:// links (previously only "https...").
    text = re.sub(r'https?://\S+', '', text)
    # Remove @-mentions, including any trailing punctuation stuck to them.
    text = re.sub(r'@\S+', '', text)
    # Remove the retweet marker only as a standalone token, so words that
    # merely contain "RT" (e.g. "SMART", "ALERT") are no longer mangled.
    text = re.sub(r'\bRT\b', '', text)

    tweet['content'] = text

    return tweet

In [10]:
def split_content_into_sents(content):
    """Split text into sentences using the module-level spaCy pipeline.

    Args:
        content: Text to segment.

    Returns:
        List with the text of each sentence spaCy finds, in document order.
    """
    doc = nlp(content)

    sentences = []
    for span in doc.sents:
        sentences.append(span.text)

    return sentences

In [11]:
def word_freq(content):
    """Count how often each meaningful token occurs in the text.

    Stop words, punctuation tokens and the literal '$' token are ignored.

    Args:
        content: Text to tokenize with the module-level spaCy pipeline.

    Returns:
        Plain dict mapping token text to its number of occurrences.
    """
    counts = Counter()
    for token in nlp(content):
        # Skip tokens that carry no content for the frequency table.
        if token.is_stop or token.is_punct or token.text == '$':
            continue
        counts[token.text] += 1

    return dict(counts)

In [12]:
def common_words(content,most_common=5):
    """Return the most frequent meaningful tokens of the text.

    Stop words, punctuation tokens and the literal '$' token are ignored.

    Args:
        content: Text to tokenize with the module-level spaCy pipeline.
        most_common: How many of the top tokens to keep.

    Returns:
        Dict mapping each of the top tokens to its count, most frequent first.
    """
    kept_tokens = [
        tok.text
        for tok in nlp(content)
        if not (tok.is_stop or tok.is_punct or tok.text == '$')
    ]
    ranked = Counter(kept_tokens).most_common(most_common)

    return dict(ranked)

In [13]:
def process_tweet(tweet):
    """Build the processed version of a raw tweet dict.

    The tweet is reduced to the fields in `filter_list`, its text is cleaned,
    and three derived entries are added: 'sents', 'word_freq' and
    'common_words' (top 5).

    Args:
        tweet: Nested raw tweet dict.

    Returns:
        A new flat dict; filter_tweet builds a fresh dict, so the input is
        not modified.
    """
    slim = remove_links_from_tweet_content(filter_tweet(tweet, filter_list))

    # All derived fields are computed from the cleaned text.
    text = slim['content']
    slim['sents'] = split_content_into_sents(text)
    slim['word_freq'] = word_freq(text)
    slim['common_words'] = common_words(text, 5)

    return slim

In [14]:
def insert_tweet_to_database(tweet, collection):
    """Store a tweet in the given collection of `mydb`, keyed by its 'id'.

    A document whose 'id' equals the tweet's is replaced; because the call
    uses upsert, the tweet is inserted when no such document exists.

    Args:
        tweet: Dict with at least an 'id' key.
        collection: Name of the target collection.

    Returns:
        None.
    """
    target = mydb[collection]
    same_id = {"id": tweet["id"]}
    target.replace_one(same_id, tweet, upsert=True)

In [15]:
def _tweet_to_dict(tweet):
    """Copy the attributes of a scraped tweet object into a plain nested dict.

    'coordinates' and 'place' are always present as sub-dicts; their values
    are None when the tweet has no coordinates / place.
    """
    record = {
        name: getattr(tweet, name)
        for name in ('id', 'url', 'date', 'content', 'renderedContent')
    }

    user = tweet.user
    record['user'] = {
        name: getattr(user, name)
        for name in (
            'username', 'id', 'displayname', 'description', 'rawDescription',
            'descriptionUrls', 'verified', 'created', 'followersCount',
            'friendsCount', 'statusesCount', 'favouritesCount', 'listedCount',
            'mediaCount', 'location', 'protected', 'linkUrl', 'linkTcourl',
            'profileImageUrl', 'profileBannerUrl',
        )
    }

    # NOTE(review): 'retweetedTweet' is stored exactly as the scraper returns
    # it; confirm it is always something MongoDB can encode.
    for name in (
        'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
        'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel',
        'outlinks', 'tcooutlinks', 'retweetedTweet', 'inReplyToTweetId',
    ):
        record[name] = getattr(tweet, name)

    coordinates = tweet.coordinates
    record['coordinates'] = {
        'longitude': coordinates.longitude if coordinates else None,
        'latitude': coordinates.latitude if coordinates else None,
    }

    place = tweet.place
    record['place'] = {
        name: getattr(place, name) if place else None
        for name in ('fullName', 'name', 'type', 'country', 'countryCode')
    }

    record['hashtags'] = tweet.hashtags
    record['cashtags'] = tweet.cashtags

    return record


def load_tweets( query, since=None, until=None, lang= 'en', max_results = 1 , collection=RAW_DATA_COLLECTION_NAME ):
    """Scrape tweets matching a query and store raw and processed copies.

    The search string is ``"<query> since:<YYYY-MM-DD> until:<YYYY-MM-DD>
    lang:<lang>"``. Each tweet is converted to a dict, processed with
    process_tweet, and both versions are upserted into MongoDB.

    Args:
        query: Search phrase.
        since: Start of the search window (datetime). Defaults to 7 days
            before the call.
        until: End of the search window (datetime). Defaults to the time of
            the call.
        lang: Language code placed in the query.
        max_results: Maximum number of tweets to handle.
        collection: Name of the collection that receives the raw tweet; the
            processed tweet always goes to PROCESSD_DATA_COLLECTION_NAME.

    Returns:
        Number of tweets stored.
    """
    # Resolve the window at call time. As default argument values these
    # datetimes were computed once, when the cell defining the function ran,
    # and went stale in a long-lived kernel.
    now = datetime.now()
    if until is None:
        until = now
    if since is None:
        since = now - timedelta(days=7)

    since = since.strftime('%Y-%m-%d')
    until = until.strftime('%Y-%m-%d')

    q = f'{query} since:{since} until:{until} lang:{lang}'

    count = 0

    for tweet in sntwitter.TwitterSearchScraper(q).get_items():
        if count >= max_results:
            break

        tw = _tweet_to_dict(tweet)

        # process_tweet builds a new dict, so `tw` is still the raw record
        # when it is stored below.
        p_tw = process_tweet(tw)

        insert_tweet_to_database(tw, collection)
        insert_tweet_to_database(p_tw, PROCESSD_DATA_COLLECTION_NAME)

        count += 1

    return count

In [2]:
# Scrape every phrase in turn, announcing progress on stdout and in the log file.
p_start_time = datetime.now()
for phrase in phrases:

    start_time = datetime.now()

    start_message = f'-------   Start loading {phrase} tweets   -------'
    print(start_message)
    logging.info(start_message)

    # `l` receives the number of tweets stored for this phrase.
    l = load_tweets(phrase, since=since, until=until, max_results=max_results)

    end_message = f'-------   End loading {phrase} tweets   -------'
    print(end_message)
    logging.info(end_message)

    end_time = datetime.now()
    # total_seconds() covers the whole interval; `.seconds` silently drops
    # whole days and would under-report a run longer than 24 hours.
    loading_time = divmod(int((end_time-start_time).total_seconds()), 60)

    done_message = f'Loading {phrase} tweets done in {loading_time[0]} minutes {loading_time[1]} seconds'
    print(done_message + '\n\n\n')
    logging.info(done_message)

p_end_time = datetime.now()
p_loading_time = divmod(int((p_end_time-p_start_time).total_seconds()), 60)

summary_message = f'Loading all {len(phrases)} phrases tweets done in {p_loading_time[0]} minutes {p_loading_time[1]} seconds :)'
print(summary_message)
logging.info(summary_message)

In [1]:
# NOTE(review): pandas is imported here rather than in the imports cell, where
# the same import is commented out; later cells rely on this cell having run.
import pandas as pd
# Read every document of the processed collection into a DataFrame.
df = pd.DataFrame(list(mydb[PROCESSD_DATA_COLLECTION_NAME].find()))
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Read every document of the raw collection into a DataFrame
# (uses `pd` imported in the preceding cell).
raw_df = pd.DataFrame(list(mydb[RAW_DATA_COLLECTION_NAME].find()))
# Bare expression: the notebook renders the frame as the cell output.
raw_df

In [None]:
# Export both frames as CSV files without the index column.
# The target folder is created first so the export also works on a fresh checkout.
export_dir = Path("./data_distribution")
export_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(export_dir / "PROCESSED_DATA.csv", index=False)
raw_df.to_csv(export_dir / "RAW_DATA.csv", index=False)

In [None]:
# Build a pandas_profiling report object for the processed-tweets frame.
profile = ProfileReport(df)

In [None]:
# Bare expression: the notebook renders the report as the cell output.
profile

In [None]:
# Write the report to an HTML file in the working directory.
profile.to_file("data_profile_report.html")