# Iranian Tweet EDA and Topic Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

### Data Filtering

In [None]:
data = pd.read_csv('data/iranian_tweets.csv')

In [None]:
data.info()

In [None]:
df = data[['userid','tweet_language','tweet_text','is_retweet','hashtags']]

In [None]:
df = df[(df.is_retweet==False)&(df.tweet_language=='en')]

### Tokenize & Lemmatize, Remove Punctuation

In [None]:
# Tfidf vectorizer takes care of stop words; it's on us to remove links, punctuation (can include in token pattern?)

In [None]:
# Leading characters that disqualify a token: ASCII punctuation plus a few
# typographic quotes, dashes, the ellipsis and the degree sign.
punc = set(string.punctuation) | set('‘’…°–—“”')
lem = nltk.stem.WordNetLemmatizer()
twt = TweetTokenizer(reduce_len=True, strip_handles=True)

def tweet_tokenize_full(tweet):
    """Tokenize a tweet and return its lemmatized, filtered tokens.

    The tweet is split with the module-level ``TweetTokenizer`` (``twt``).
    A token is kept only if all of the following hold:

    * its first character is not in ``punc``,
    * it is longer than two characters,
    * it does not start with ``'http'`` (i.e. it is not a link).

    Surviving tokens are passed through the WordNet lemmatizer ``lem``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.
    """
    return [
        lem.lemmatize(tok)
        for tok in twt.tokenize(tweet)
        if tok[0] not in punc and len(tok) > 2 and not tok.startswith('http')
    ]

In [None]:
# Vocabulary size cap and n-gram range (unigrams only) for the TF-IDF features.
num_feats = 1000
ngrams = (1,1)
# Word-level TF-IDF that delegates tokenization to tweet_tokenize_full instead of
# the vectorizer's default token pattern.
# NOTE(review): the built-in English stop list is presumably applied to the
# tokenizer's (lemmatized) output, so some stop words may not match — confirm.
vctr = TfidfVectorizer(analyzer='word',
                       stop_words='english',
                       tokenizer=tweet_tokenize_full,
                       max_features=num_feats,
                       ngram_range=ngrams)

In [None]:
X = vctr.fit_transform(df.tweet_text)

In [None]:
tfidf_vals = X.toarray()

In [None]:
# Vocabulary terms in column order of X.
# `get_feature_names` was removed from recent scikit-learn releases in favour of
# `get_feature_names_out`; use the new accessor when present and fall back to
# the old one so the cell runs on either version.
if hasattr(vctr, 'get_feature_names_out'):
    feature_names = np.asarray(vctr.get_feature_names_out())
else:
    feature_names = np.array(vctr.get_feature_names())

In [None]:
tfidf_results = pd.DataFrame(data = tfidf_vals, columns=feature_names)

In [None]:
# Number of topics to extract from the TF-IDF matrix.
n_components = 5
# Fit NMF with the multiplicative-update solver and a fixed seed for repeatability.
# NOTE(review): the `alpha` keyword appears to have been deprecated in scikit-learn 1.0
# and removed in 1.2 (superseded by `alpha_W` / `alpha_H`, which scale the penalty
# differently) — confirm against the installed version before re-running.
nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, solver='mu', max_iter=1000, l1_ratio=0.5).fit(X)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing ``components_`` (one row of term
        weights per topic).
    feature_names : sequence
        Term for each column of ``components_``.
    n_top_words : int
        How many terms to show per topic.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[col] for col in ranked]
        print("Topic #{0}: \n{1}\n".format(topic_idx, top_words))
    print()

In [None]:
n_top_words = 15
print_top_words(nmf, feature_names, n_top_words)

## Russia Time

In [28]:
rus_df = pd.read_csv('data/ira_tweets.csv',nrows=200000)

In [None]:
rus_df.sample(10)

In [None]:
list(rus_df.columns)

### User Following Analysis

In [None]:
# Per-account means of the numeric columns (follower/following counts, engagement counts, ...).
# numeric_only=True is needed because rus_df also holds text columns: recent pandas
# raises a TypeError when asked to average them instead of silently dropping them.
rus_users_grouper = rus_df.groupby('userid')
rus_users = rus_users_grouper.mean(numeric_only=True)
rus_users

In [None]:
# Follower count vs. following count, one point per account in rus_users,
# with both axes clipped to 0-150,000.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(rus_users['following_count'], rus_users['follower_count'], alpha=0.5)
ax.set_xlim(0, 150000)
ax.set_ylim(0, 150000)

In [None]:
# Keep only English, non-retweet rows and the handful of columns used for
# the text inspection below.
is_original_english = (rus_df.is_retweet==False)&(rus_df.tweet_language=='en')
rus_df_filt = rus_df[is_original_english][['userid','tweet_text','is_retweet','hashtags','urls']]

rus_df_filt.sample(10)

In [None]:
rus_df_filt.loc[80841]['tweet_text']

In [None]:
rus_df_filt.sample(100)

In [None]:
# Replace missing values with empty strings. Rebinding the name (rather than
# fillna(inplace=True)) avoids mutating a frame that was derived by slicing
# rus_df, which can trigger pandas' SettingWithCopyWarning.
rus_df_filt = rus_df_filt.fillna('')
# One row per user; every column becomes the list of that user's values.
grouper = rus_df_filt.groupby('userid').agg(list)
# Inspect the tweet lists for a slice of 100 users.
list(grouper.tweet_text.values)[100:200]

### Observations / patterns
Specific
* Some accounts almost exclusively retweet; need to compare to regular users
* Frequent link sharing (proxy for retweeting, possibly optimizing follower growth)
* Short, pithy quotes, often ending in - or ~ and quote attribution

General
* Hashtags that are off-topic for the tweets they appear in (large distance between a tweet's position in 'tweet2vec' space and its hashtags' position in 'hashtag2vec' space, aggregated by user account)
* Tweets that are off-topic relative to each other (large net distance in 'tweet2vec' space between a user's tweets; this applies to retweets as well)
* Tweets with inconsistent grammatical structure or vocabulary (different function word usage between tweets by user)
* Spamlike hashtag or linking behavior between tweets (tfidf shows high term frequency for link and hashtag strings across user's tweet corpus)

### EDA on political tweets dataset

In [None]:
political_tweets = pd.read_csv('data/political_social_media.csv', encoding = "ISO-8859-1")

In [None]:
list(political_tweets.columns)

In [None]:
list(political_tweets.sample(100).text.values)

### Get Normie Tweets (May 8 2018 snapshot)

In [None]:
test_df = pd.read_json('data/twitter-2018-05-08/2018/05/08/01/00.json', lines=True)

In [2]:
def get_normie_tweets(path='data/twitter-2018-05-08/2018/05/08',
                      subfolders=('00','01','02','03')):
    """Load the English tweets from a directory of per-minute JSON-lines files.

    For every folder in ``subfolders`` the files ``00.json`` .. ``59.json``
    under ``path/<folder>/`` are read with ``pd.read_json(..., lines=True)``
    and the rows whose ``lang`` column equals ``'en'`` are kept.

    Loading is best-effort: a file that is missing, cannot be parsed, or has
    no ``lang`` column is skipped. Only those expected failures are caught,
    so interrupts and unrelated errors still propagate.

    Parameters
    ----------
    path : str, optional
        Directory containing the hour sub-folders. Defaults to the
        2018-05-08 snapshot location used throughout this notebook.
    subfolders : iterable of str, optional
        Names of the hour sub-folders to read.

    Returns
    -------
    pandas.DataFrame
        All English rows from every readable file, concatenated in
        folder/minute order with each file's original row index preserved.
        An empty DataFrame if nothing could be read.
    """
    frames = []
    for folder in subfolders:
        for minute in range(60):
            # Minute files are zero-padded to two digits: 00.json ... 59.json
            file_path = '{0}/{1}/{2:02d}.json'.format(path, folder, minute)
            try:
                temp_df = pd.read_json(file_path, lines=True)
                frames.append(temp_df[temp_df['lang']=='en'])
            except (OSError, ValueError, KeyError):
                # Missing file, malformed JSON, or no 'lang' column: skip it.
                continue

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the result per file.
    return pd.concat(frames, sort=True)

In [3]:
eng_tweets = get_normie_tweets()

In [None]:
# Load minute files 00.json-57.json of hour folder 01 (no language filter here)
# and concatenate them once, rather than growing the frame file by file.
minute_frames = [
    pd.read_json('data/twitter-2018-05-08/2018/05/08/01/{:02d}.json'.format(i), lines=True)
    for i in range(0,58)
]
tweets_df = pd.concat(minute_frames, sort=True)
tweets_df.info()

### Notes on user accounts

Real (non-IRA) user account data (example):
* 'id'
*  'name': 'Claptrap',
*  'screen_name': 'ECHOcasts_ebook',
*  'location': None,
*  'description': 'I can see… the code',
*  'url': None,
*  'followers_count': 35,
*  'friends_count': 14,
*  'statuses_count': 16506,
*  'created_at': 'Sat Mar 19 17:12:51 +0000 2016',
*  'lang': 'fr',

comparable IRA account data:
* 'userid',
* 'user_display_name',
* 'user_screen_name',
* 'user_reported_location',
* 'user_profile_description',
* 'user_profile_url',
* 'follower_count',
* 'following_count',
* <<< impute statuses count >>>
* 'account_creation_date',
* 'account_language',

In [18]:
list(eng_tweets['user'].values[:10])

[{'id': 2572835998,
  'id_str': '2572835998',
  'name': 'Bright Opportunities',
  'screen_name': 'OppBright',
  'location': 'Buxton Derbyshire ',
  'url': 'http://www.brightopportunities.org.uk',
  'description': None,
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 158,
  'friends_count': 604,
  'listed_count': 1,
  'favourites_count': 202,
  'statuses_count': 3315,
  'created_at': 'Fri May 30 20:43:21 +0000 2014',
  'utc_offset': None,
  'time_zone': None,
  'geo_enabled': False,
  'lang': 'en',
  'contributors_enabled': False,
  'is_translator': False,
  'profile_background_color': 'C0DEED',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_tile': False,
  'profile_link_color': '1DA1F2',
  'profile_sidebar_border_color': 'C0DEED',
  'profile_sidebar_fill_color': 'DDEEF6',
  'profile_text_co

In [None]:
def get_user_data(row):
    """Copy selected fields of the nested ``user`` record onto the row itself.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must hold a ``'user'`` entry with ``'id'``, ``'followers_count'``
        and ``'friends_count'`` keys.

    Returns
    -------
    The same ``row`` object, updated in place with ``'userid'``,
    ``'follower_count'`` and ``'following_count'``.
    """
    user = row['user']
    # (destination field on the row, source key in the user record)
    field_map = (
        ('userid', 'id'),
        ('follower_count', 'followers_count'),
        ('following_count', 'friends_count'),
    )
    for target, source in field_map:
        row[target] = user[source]
    return row
    
# Row-wise apply: adds userid / follower_count / following_count columns
# taken from each tweet's nested 'user' record.
eng_users = eng_tweets.apply(get_user_data, axis=1)
eng_users.sample(10)

In [None]:
eng_users.info()

In [None]:
# Per-account means of the count/flag columns for the regular-user sample.
user_metric_cols = ['userid','follower_count','following_count','reply_count','retweeted','quote_count']
eng_users_grouper = eng_users[user_metric_cols].groupby('userid')
eng_users_grouped = eng_users_grouper.mean()

In [None]:
# Side-by-side follower vs. following scatter: eng_users_grouped on the left,
# rus_users (red) on the right. The y axis is shared, so the single set_ylim
# call at the end applies to both panels.
fig, axs = plt.subplots(1, 2, sharey=True, figsize=(24, 12))
ax1, ax2 = axs
ax1.scatter(eng_users_grouped['following_count'], eng_users_grouped['follower_count'], alpha=0.1)
ax2.scatter(rus_users['following_count'], rus_users['follower_count'], alpha=0.2, color='r')
for panel in (ax1, ax2):
    panel.set_xlim(0, 5000)
ax1.set_ylim(0, 5000)

### Define follower ratio

In [None]:
# follower_ratio = followers / (following + 1); the +1 keeps the denominator
# non-zero for accounts that follow nobody.
for users in (eng_users_grouped, rus_users):
    users['follower_ratio'] = users['follower_count'] / (users['following_count'] + 1)

In [None]:
# Histograms of follower_ratio for both groups using the same bin edges
# (width 0.2, covering 0-2); ratios above 2 fall outside the bins and are not drawn.
ratio_bins = [0,.2,.4,.6,.8,1,1.2,1.4,1.6,1.8,2]
fig, axs = plt.subplots(1, 2, figsize=(24, 12))
ax1, ax2 = axs
ax1.hist(eng_users_grouped['follower_ratio'], bins=ratio_bins)
ax2.hist(rus_users['follower_ratio'], bins=ratio_bins, color='r')
# ax1.set_xscale('log')
# ax2.set_xscale('log')

There is no obvious difference between the two distributions of follower:following ratios, except that the Russian (IRA) accounts tend to have slightly lower ratios than the regular-user sample.

In [None]:
eng_users_grouped['follower_ratio'].describe()

In [None]:
rus_users['follower_ratio'].describe()

### Add retweet metadata to normal and russian tweets

In [19]:
def add_retweet_metadata(row):
    """Flag a tweet row as a retweet, a quote tweet, or neither, from its text.

    Classification rules, applied in order to ``row['text']``:

    1. Starts with ``'RT @'`` or contains ``': RT '``  -> retweet.
    2. Otherwise contains ``' RT '``                    -> quote tweet; the
       text before the first ``' RT '`` is stored in ``'quote_tweet_text'``.
    3. Otherwise                                        -> neither.

    Both ``'is_retweet'`` and ``'is_quote_tweet'`` are set (0 or 1) on every
    path, so a quote tweet no longer ends up with a missing or stale
    ``'is_retweet'`` value. Rows whose text is not a string (e.g. NaN) are
    treated as "neither" instead of raising.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must hold a ``'text'`` entry.

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    text = row['text']

    # Missing / non-text values cannot be inspected; mark them as plain tweets.
    if not isinstance(text, str):
        row['is_retweet'] = 0
        row['is_quote_tweet'] = 0
        return row

    if text.startswith('RT @') or ': RT ' in text:
        row['is_retweet'] = 1
        row['is_quote_tweet'] = 0
    elif ' RT ' in text:
        row['is_retweet'] = 0
        row['is_quote_tweet'] = 1
        # The user's own comment is everything before the first ' RT '.
        row['quote_tweet_text'] = text.split(' RT ', 1)[0]
    else:
        row['is_retweet'] = 0
        row['is_quote_tweet'] = 0
    return row

In [20]:
norm_tweets = eng_tweets.apply(add_retweet_metadata,axis=1)

In [24]:
norm_tweets[norm_tweets['is_retweet']==1]['text'].values[0:100]

array(['RT @biticonjustine: WELCOME TO THE CHURCH OF RIHANNA. COME. SHE SHALL BAPTIZE YOU IN BODY LAVA AND SIN. https://t.co/xIvBunLnhO',
       'RT @HectorBellerin: Such an emotional day to play our last game together. You have been a role model on and off the pitch for me and many y…',
       'RT @trackpop_: How can you not love BTS? https://t.co/CHHh70U2te',
       'RT @RandyEBarnett: 7 of 19 were written by liberals. Ancillary list of authors at the end provides many more. https://t.co/jd5S4Ni9T7',
       'RT @MiltShook: This is true. The question is, why do so many on the left go after her? Because she doesn’t always fulfill their cartoon ver…',
       'RT @nctzenunion_usa: [ #OFFICIAL ] 180503 M Countdown with MC #JOHNNY (1)\n@NCTsmtown #NCT \n📸: https://t.co/ov0icvVqwC https://t.co/G6a68HTV…',
       'RT @Priyabh70915309: This 😍😍😘😘❤ #keesh\n#naksh #Keerti \n#yrkkh https://t.co/LiUQAjK7qt',
       "RT @StevieWonder: .@BTS_twt's full dream.  Share your dream &amp; post your own #D

In [36]:
# Align the IRA column names with the regular-tweet frame so that
# add_retweet_metadata (which reads row['text']) works on both.
# NOTE: this mutates rus_df in place; earlier cells that reference
# tweet_text / tweet_language will fail if re-run after this one.
rus_df.rename(columns={'tweet_text':'text','tweet_language':'lang'}, inplace=True)
rus_df.columns

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language', 'lang', 'text',
       'tweet_time', 'tweet_client_name', 'in_reply_to_tweetid',
       'in_reply_to_userid', 'quoted_tweet_tweetid', 'is_retweet',
       'retweet_userid', 'retweet_tweetid', 'latitude', 'longitude',
       'quote_count', 'reply_count', 'like_count', 'retweet_count', 'hashtags',
       'urls', 'user_mentions', 'poll_choices'],
      dtype='object')

In [38]:
ira_tweets = rus_df[rus_df.lang=='en'].apply(add_retweet_metadata,axis=1)

In [39]:
ira_tweets[ira_tweets['is_retweet']==1].text.values[0:10]

array(["RT @e933848ee109a3e968bfefb54a785c0426edab39c144c0ace1eb734cde5efed0: Saw it several days ago during a trip and that's amazing! Can you see the face? Who is it? http://t.co/rP1uJgQjOb",
       'RT @MTVNews: SO. DAMN. CUTE. https://t.co/tiUjVi4JrO',
       'RT @Real923LA: A few of our lucky listeners hanging out at the Palladium tonight, waiting for @BigSean tonight! #BigSean #IDecidedTour http…',
       'RT @ReignOfApril: The Immortal Life of #HenriettaLacks starring @Oprah and @ReneeGoldsberry starts NOW on @hbo.',
       'RT @RappersIQ: #Papoose Ft. #RemyMa "Black Love" #Remix #PRO https://t.co/LBcdBBi1Q9 #TBT #Day #Follows #Play #Meet #Fav #Rt',
       "RT @2DopeBoyz: Press play on @knxwledge's WT.PRT10.8 project; with flips of @Drake, @21Savage, @OTGenasis, @kurngb and more: https://t.co/X…",
       'RT @BlackNewsOutlet: Bobby Seale and Huey Newton, the founders of the Black Panther Party, 1966. https://t.co/eyfPzVFB8B',
       'RT @Herobright2: @UNHumanRights NIGERIA CRIES

In [None]:
ira_tweets.shape

In [None]:
eng_tweets.columns

### Put data into bucket

In [None]:
# Persist the English-only snapshot to CSV for upload to the bucket.
# No index=False is passed, so the frame's index is written as the first column.
eng_tweets.to_csv('data/2018-05-08_twitter_data_filtered.csv')

In [None]:
# !aws s3 mb s3://alex-john-tweets
# !aws s3 cp data/2018-05-08_twitter_data_filtered.csv s3://alex-john-tweets
# !aws s3 ls alex-john-tweets

### Time series EDA of user tweets

In [None]:
eng_tweets