# Iranian Tweet EDA and Topic Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

### Data Filtering

In [2]:
# Read the complete Iranian tweet export into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/iranian_tweets.csv')

In [70]:
# Summarize the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122936 entries, 0 to 1122935
Data columns (total 31 columns):
tweetid                     1122936 non-null int64
userid                      1122936 non-null object
user_display_name           1122936 non-null object
user_screen_name            1122936 non-null object
user_reported_location      887669 non-null object
user_profile_description    995845 non-null object
user_profile_url            434954 non-null object
follower_count              1122936 non-null int64
following_count             1122936 non-null int64
account_creation_date       1122936 non-null object
account_language            1122936 non-null object
tweet_language              1117307 non-null object
tweet_text                  1122936 non-null object
tweet_time                  1122936 non-null object
tweet_client_name           1100078 non-null object
in_reply_to_tweetid         339350 non-null float64
in_reply_to_userid          440244 non-null object
quoted_twe

In [3]:
# Keep only the columns used for filtering and topic modeling.
df = data.loc[:, ['userid', 'tweet_language', 'tweet_text', 'is_retweet', 'hashtags']]

In [4]:
# Restrict to original (non-retweet) tweets whose language tag is English.
df = df.loc[df['is_retweet'].eq(False) & df['tweet_language'].eq('en')]

### Tokenize & Lemmatize, Remove Punctuation

In [5]:
# TfidfVectorizer takes care of stop words; removing links and punctuation is up to us (TODO: could this be folded into the token pattern instead?)

In [63]:
# Tweet-aware tokenizer: strip_handles drops @mentions, reduce_len shortens
# runs of repeated characters.
twt = TweetTokenizer(strip_handles=True, reduce_len=True)
# WordNet lemmatizer applied to every token that survives filtering.
lem = nltk.stem.WordNetLemmatizer()
# ASCII punctuation plus typographic marks (curly quotes, ellipsis, dashes, degree sign).
punc = set(string.punctuation) | set('‘’…°–—“”')

def tweet_tokenize_full(tweet):
    """Tokenize a tweet and return its lemmatized word tokens.

    Tokens are discarded when their first character is in ``punc``, when
    they are two characters or shorter, or when they begin with ``http``.
    The remaining tokens are lemmatized with ``lem`` and returned in their
    original order.
    """
    kept = []
    for tok in twt.tokenize(tweet):
        # Skip punctuation-led tokens and very short tokens.
        if tok[0] in punc or len(tok) <= 2:
            continue
        # Skip links.
        if tok[:4] == 'http':
            continue
        kept.append(lem.lemmatize(tok))
    return kept

In [94]:
# Vocabulary cap and n-gram span (unigrams only) for the TF-IDF model.
num_feats, ngrams = 1000, (1, 1)
# Word-level TF-IDF driven by the custom tweet tokenizer, with English stop
# words removed.
vctr = TfidfVectorizer(
    tokenizer=tweet_tokenize_full,
    analyzer='word',
    stop_words='english',
    ngram_range=ngrams,
    max_features=num_feats,
)

In [None]:
# Learn the vocabulary and build the TF-IDF matrix from the tweet text.
X = vctr.fit_transform(df['tweet_text'])

In [None]:
# Convert the TF-IDF matrix to a dense array.
# NOTE(review): this allocates one value per (tweet, feature) pair — confirm it fits in memory for the full corpus.
tfidf_vals = X.toarray()

In [None]:
# scikit-learn 1.2 removed get_feature_names(); use get_feature_names_out()
# when the installed version provides it and fall back to the legacy method
# on older releases.
feature_names = np.array(
    vctr.get_feature_names_out()
    if hasattr(vctr, 'get_feature_names_out')
    else vctr.get_feature_names()
)

In [None]:
# Tabulate the dense TF-IDF values with one column per vocabulary term.
tfidf_results = pd.DataFrame(tfidf_vals, columns=feature_names)

In [None]:
# Number of topics to extract.
n_components = 5
# Configure the factorization first, then fit it on the TF-IDF matrix.
# NOTE(review): newer scikit-learn releases replace the `alpha` keyword with
# alpha_W / alpha_H — confirm against the installed version.
nmf = NMF(
    n_components=n_components,
    solver='mu',
    max_iter=1000,
    alpha=0.1,
    l1_ratio=0.5,
    random_state=1,
)
nmf = nmf.fit(X)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` the ``n_top_words`` entries with
    the largest weights are looked up in ``feature_names`` and printed,
    largest first. A blank line is printed after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # Sort ascending, flip to descending, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = []
        for pos in ranked:
            top_words.append(feature_names[pos])
        print("Topic #{0}: \n{1}\n".format(topic_no, top_words))
    print()

In [None]:
# Show the 15 strongest terms for each topic.
n_top_words = 15
print_top_words(model=nmf, feature_names=feature_names, n_top_words=n_top_words)

## Russia Time

In [3]:
# Load only the first 100,000 rows of the IRA dump to keep exploration fast.
# The userid column mixes plain numeric IDs with hashed IDs, so read it as
# text to get a single, consistent key type for the per-user grouping later.
# NOTE(review): the warning emitted by this cell looks like pandas'
# mixed-dtype warning; other columns may need an explicit dtype too — confirm.
rus_df = pd.read_csv('data/ira_tweets.csv', nrows=100000, dtype={'userid': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Eyeball ten random rows of the raw frame.
rus_df.sample(n=10)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,account_creation_date,...,latitude,longitude,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,poll_choices
25057,761127697414098944,2532611755,Kathie,KathieMrr,Atlanta,"Imperfection is beauty, madness is genius and ...",,6709,5606,2014-05-29,...,,,0.0,0.0,1.0,0.0,[ThingsToDoInAWaitingRoom],[],,
67000,668764746619953152,e2218e53cd1fc506d06955fb3076e395ae227d634b3207...,e2218e53cd1fc506d06955fb3076e395ae227d634b3207...,e2218e53cd1fc506d06955fb3076e395ae227d634b3207...,Казань,,,354,488,2013-12-26,...,,,0.0,0.0,0.0,0.0,[],[http://vesti.ru/t?2690072],[72525490],
24252,602866978869551105,2570574680,РИА ФАН,riafanru,Россия,Федеральное агентство новостей ВКонтакте: http...,http://t.co/tcKe2jsuqw,15753,6862,2014-06-16,...,,,0.0,0.0,0.0,2.0,[],[http://riafan.ru/289049-novosti-novorossii-s-...,,
54471,531053563620118530,6afabf04a71b1ba2beb3e9b8b2786b7b6a7f6f4019f154...,6afabf04a71b1ba2beb3e9b8b2786b7b6a7f6f4019f154...,6afabf04a71b1ba2beb3e9b8b2786b7b6a7f6f4019f154...,МSK,"Общественный деятель, зам. председателя молоде...",,455,1578,2014-06-08,...,,,0.0,0.0,0.0,0.0,,[http://pavnodiuzov.livejournal.com/84470.html],,
62853,783235979629002752,0945c828c3f3fa4064a6e67cbaa1c15827f6de9f3f9e69...,0945c828c3f3fa4064a6e67cbaa1c15827f6de9f3f9e69...,0945c828c3f3fa4064a6e67cbaa1c15827f6de9f3f9e69...,USA,#hashtagwarrior and your mom knows it!,,3673,4700,2014-05-29,...,,,0.0,0.0,0.0,0.0,[unlikelythingsheardatwalmart],[],,
97299,585532550547898368,1240007161,Замполит,ComradZampolit,Москва (СССР - Россия),"Глава общественной организации ""АгитПолк"" /// ...",https://t.co/W2Cq3uYUL6,48912,1797,2013-03-03,...,,,0.0,0.0,0.0,1.0,[],[],[338960856],
24904,862715488115105794,a95a911dd6ae864c48ed062cdbe75e5c28dbe0cf57c6db...,a95a911dd6ae864c48ed062cdbe75e5c28dbe0cf57c6db...,a95a911dd6ae864c48ed062cdbe75e5c28dbe0cf57c6db...,United States,No more #HappyHolidays shit!!! It's #MerryChri...,https://t.co/XFnhCqCWBy,2748,265,2016-06-15,...,,,0.0,0.0,0.0,0.0,"[MAGA, TrumpTrain, Russia, TrumpsArmy]",[https://twitter.com/uthornsrawk/status/862714...,[2746979823],
62518,488353185380438016,aa185e9a161da2b5a785f70c37b14dcb960b139b1a238d...,aa185e9a161da2b5a785f70c37b14dcb960b139b1a238d...,aa185e9a161da2b5a785f70c37b14dcb960b139b1a238d...,,главное в людях - чувство юмора и наличие пульса,,263,337,2013-12-30,...,,,0.0,0.0,0.0,0.0,,[http://ppogresseo.livejournal.com/68414.html],,
95700,842622087596523520,5cabae0aeac6932ac568a48cac2084dcf57313479366c0...,5cabae0aeac6932ac568a48cac2084dcf57313479366c0...,5cabae0aeac6932ac568a48cac2084dcf57313479366c0...,"Berlin, Deutschland",Lokale Nachrichten aus Berlin. Hier kannst Du ...,,1852,2228,2016-04-27,...,,,0.0,0.0,0.0,0.0,[],[http://www.morgenpost.de/berlin/article209958...,,
55334,805846162305597440,4224729994,Tennessee,TEN_GOP,,Unofficial Twitter of Tennessee Republicans. C...,,147767,74664,2015-11-19,...,,,27.0,27.0,574.0,583.0,[],[],,


In [5]:
# English, non-retweet rows only, narrowed to the text-related columns in a
# single selection.
rus_df_filt = rus_df.loc[
    rus_df['is_retweet'].eq(False) & rus_df['tweet_language'].eq('en'),
    ['userid', 'tweet_text', 'is_retweet', 'hashtags', 'urls'],
]

rus_df_filt.sample(n=10)

Unnamed: 0,userid,tweet_text,is_retweet,hashtags,urls
72158,2611151319,"Top high school basketball performers, Jan. 29...",False,[sports],[]
30473,bc1e9ff0868dd5b651e8a25c3cf98ffd3046f3484317ad...,This was less sexy than I had hoped https://t....,False,[],[https://vine.co/v/OYHdWHaeFVl]
15078,bf9dba91bbd72a2145c1e8c620d811f22c416501ec495f...,#FukushimaAgain There`s an emergency situation...,False,[FukushimaAgain],[]
88184,2495567768,Students’ upset about bullying devised plot to...,False,[news],[]
26268,eb717d6e86e610e614145396f4882366370ec4f2a73923...,"Trump Defends Life, Hillary Partial Birth #Abo...",False,"[Abortion, MAGA, PJNET, TCOT]",[http://bit.ly/2eEWfvO]
40182,cc19cd3fba790e5aa198cc54d51c6fc7fa022d16ffdaf8...,VIRAL VIDEO : HILARIOUS PARODY “The Hunt for H...,False,[],[http://viid.me/qw9MVN]
73517,6267a95063fe47b02c35cae310775557ef694ccbdfdcb8...,Petition DEMANDS ARREST of Senator Who Called ...,False,[],[http://ift.tt/2xboA6b]
92006,a7e66c2c8b0ea83b084c62470e9b5a7eabe82ef201c07b...,"Man is not what he thinks he is, he is what he...",False,,
27399,8cd6774724c40801bccd8cfa0e1fe42128aa5f132686ff...,Age is no guarantee of maturity. - Lawana Blac...,False,"[quote, true]",
74937,60f0b63e612aa72961a991ed83ad3318ccaa22f9c343ea...,http://t.co/gOFPdwJ8hV That shit fie and she g...,False,[],[http://www.ImageEra.com/That-shit-fie-and-she...


In [6]:
# Read the full text of one tweet by index label.
rus_df_filt.loc[80841, 'tweet_text']

"Can't sleep so:   Loving,Beautiful,Can't be replaced #Momin5words #rip"

In [7]:
# Browse a larger random sample of the filtered tweets.
rus_df_filt.sample(n=100)

Unnamed: 0,userid,tweet_text,is_retweet,hashtags,urls
36176,2882037326,#IHatePokemonGoBecause they said to me they fo...,False,[IHatePokemonGoBecause],[]
12945,2743327187,#news AAA Receives Record Number of Calls Amid...,False,[news],[]
79567,533bd35fdbbec65d406b13a6a6e128c0604f41a8e9c1bd...,Supreme Court Justice #ObamaNextJob,False,[ObamaNextJob],[]
97256,87a4a47fd06dcadfdd84e02849dd9e059ad3cd2b1cec36...,#chanlePigs are FLYING! CNN Just took Presiden...,False,[chanlePigs],[http://ift.tt/2fmdkQw]
37149,36f2768e6fcac2041cd5604315cab62bc02c808f1ddb34...,"Falling for someone is easy, getting back up a...",False,[],[]
34438,2611151319,American Pharoah ends racing career by romping...,False,[sports],[]
98832,0ba7b20ac6f6508ed2189050961f219b62ab06da5ed1aa...,Birthday selfie ??? via @brisalazar10 http://t...,False,[],[http://www.GoodSelfie.Club/Selfie/2015/06/12/...
91784,034e6c5e16aab6dcb5994b9583ae46c9b6757a8053e3a8...,#top RT ShaneTHazel: https://t.co/v5jBCs2ai0,False,[top],[http://fb.me/820yrrKFM]
93741,9d08c8b245d92ea4fa33bf3ac99f6b7e2e987068f8f7be...,Nothing is so hard for those who abound in ric...,False,,
66353,60f0b63e612aa72961a991ed83ad3318ccaa22f9c343ea...,Emerging Stocks Sink on Greece Fears as China ...,False,[],[http://www.Stocks.Band/RichardGEarl]


In [8]:
# Replace missing values with empty strings. Rebind instead of mutating in
# place: rus_df_filt was derived by slicing rus_df, and in-place edits on
# such a frame can trigger SettingWithCopyWarning.
rus_df_filt = rus_df_filt.fillna('')
# Collapse to one row per account; each remaining column becomes the list of
# that account's values.
grouper = rus_df_filt.groupby('userid').agg(list)

In [9]:
# Inspect the per-account tweet lists for accounts 100-199.
grouper['tweet_text'].tolist()[100:200]

[["Well, you can't even argue with that! https://t.co/r8YWspvNaN"],
 ['Farsnews https://t.co/LOeU9ESKB2'],
 ['You wanna count my money go to college need a math course',
  'In my dreams I dwell, cause all my dreams are swell'],
 ['young lovers https://t.co/GmJaWzDEIu'],
 ['#Cosby loses bid for quick appeal that could have halted case https://t.co/qGY6fOrTCi https://t.co/AM1H084xb9',
  'Ex-Gitmo detainee connected to Turkey airport attack added ... https://t.co/cSgWYAqrvP | https://t.co/jZiHEHbXig https://t.co/BJlKxG546L',
  'student suing school district, #PD, city for #SexualAbuse by officer who committed suicide https://t.co/CnfMn8ZKvT https://t.co/ZMWQx9Ji6N'],
 ["Smile, because you're beautiful. Laugh, because you're living life to the fullest. Stand strong, because haters can't bring you down."],
 ["Just admit it, loverboy. You can't resist me.",
  'RT @ToriiKat: #MyBestFriendsInFourWords "Let\'s go get VIP"',
  'I hate taking photos! Never look got in pictures!',
  'Big and beaut

### Observations / patterns
Specific
* Spamming "RT", especially at the beginning of tweets
* Frequent link sharing (a proxy for retweeting, possibly to optimize follower growth)
* Short, pithy quotes, often ending with `-` or `~` followed by a quote attribution

General
* Hashtags that are off-topic for their tweets (large distance between a tweet's content in 'tweet2vec' space and its hashtags in 'hashtag2vec' space, aggregated by user account)
* Tweets that are off-topic from each other (large net distance in 'tweet2vec' space between a user's tweets)
* Tweets with inconsistent grammatical structure or vocabulary (different function-word usage between tweets by the same user)
* Spam-like hashtag or linking behavior between tweets (TF-IDF shows high term frequency for link and hashtag strings across a user's tweet corpus)