# Iranian Tweet EDA and Topic Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

### Data Filtering

In [2]:
# Load the Iranian tweet CSV from the local data directory into a DataFrame.
data = pd.read_csv('data/iranian_tweets.csv')

In [70]:
# Summarize column names, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122936 entries, 0 to 1122935
Data columns (total 31 columns):
tweetid                     1122936 non-null int64
userid                      1122936 non-null object
user_display_name           1122936 non-null object
user_screen_name            1122936 non-null object
user_reported_location      887669 non-null object
user_profile_description    995845 non-null object
user_profile_url            434954 non-null object
follower_count              1122936 non-null int64
following_count             1122936 non-null int64
account_creation_date       1122936 non-null object
account_language            1122936 non-null object
tweet_language              1117307 non-null object
tweet_text                  1122936 non-null object
tweet_time                  1122936 non-null object
tweet_client_name           1100078 non-null object
in_reply_to_tweetid         339350 non-null float64
in_reply_to_userid          440244 non-null object
quoted_twe

In [3]:
# Keep only the five columns used by the cells below.
df = data.loc[:, ['userid', 'tweet_language', 'tweet_text', 'is_retweet', 'hashtags']]

In [4]:
# Keep rows that are not retweets and whose tweet_language is 'en'.
df = df.loc[df['is_retweet'].eq(False) & df['tweet_language'].eq('en')]

### Tokenize & Lemmatize, Remove Punctuation

In [5]:
# TfidfVectorizer handles stop-word removal; removing links and punctuation is on us (could this be folded into the token pattern instead?)

In [63]:
# Characters treated as punctuation: ASCII punctuation plus a few typographic marks.
punc = set(string.punctuation) | set('‘’…°–—“”')
# Shared lemmatizer and tweet tokenizer instances used by the tokenizing function.
lem = nltk.stem.WordNetLemmatizer()
twt = TweetTokenizer(strip_handles=True, reduce_len=True)

def tweet_tokenize_full(tweet):
    """Tokenize a tweet and return its lemmatized word tokens.

    A token is dropped when its first character is in ``punc``, when it is
    two characters long or shorter, or when it starts with ``http``. Every
    surviving token is passed through ``lem.lemmatize``.

    Parameters
    ----------
    tweet : str
        Raw tweet text handed to ``twt.tokenize``.

    Returns
    -------
    list
        Lemmatized tokens in their original order.
    """
    return [
        lem.lemmatize(word)
        for word in twt.tokenize(tweet)
        if word[0] not in punc and len(word) > 2 and not word.startswith('http')
    ]

In [94]:
# Vocabulary size cap and n-gram range for the TF-IDF features.
num_feats = 1000
ngrams = (1,1)
vctr = TfidfVectorizer(
    tokenizer=tweet_tokenize_full,  # custom tokenizer instead of the default token pattern
    analyzer='word',
    stop_words='english',
    ngram_range=ngrams,
    max_features=num_feats,
)

In [None]:
# Fit the vectorizer on the filtered tweet text and transform it in one call.
X = vctr.fit_transform(df.tweet_text)

In [None]:
# Convert the vectorizer output to a dense array (every row x feature value is held in memory).
tfidf_vals = X.toarray()

In [None]:
# Vocabulary terms in column order, as a NumPy array so they can be indexed
# with the integer arrays produced by argsort.
# Newer scikit-learn releases expose `get_feature_names_out()` and no longer
# provide `get_feature_names()`; fall back to the old name on older installs.
if hasattr(vctr, 'get_feature_names_out'):
    feature_names = np.asarray(vctr.get_feature_names_out())
else:
    feature_names = np.array(vctr.get_feature_names())

In [None]:
# Dense TF-IDF values labelled with their vocabulary terms as column names.
tfidf_results = pd.DataFrame(tfidf_vals, columns=feature_names)

In [None]:
# Number of topics to extract.
n_components = 5
# NOTE(review): newer scikit-learn releases appear to replace `alpha` with
# `alpha_W` / `alpha_H` (with different scaling) - confirm against the
# installed version before re-running.
nmf = NMF(
    n_components=n_components,
    solver='mu',
    max_iter=1000,
    alpha=0.1,
    l1_ratio=0.5,
    random_state=1,  # fixed seed so the factorization is repeatable
).fit(X)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated one row per topic.
    feature_names : sequence
        Words indexed by feature position.
    n_top_words : int
        How many words to list per topic, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Sort ascending, flip to descending, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = []
        for position in ranked:
            top_words.append(feature_names[position])
        print("Topic #{0}: \n{1}\n".format(topic_idx, top_words))
    print()

In [None]:
# Show the 15 highest-weighted words for each NMF topic.
n_top_words = 15
print_top_words(nmf, feature_names, n_top_words)

## Russia Time

In [5]:
# Load the first 100,000 rows of the IRA tweet CSV.
# The truncated warning printed under this cell looks like a pandas
# DtypeWarning (mixed types in a column), and `userid` holds both hashed
# strings and numeric-looking ids. Reading `userid` as text and disabling
# chunked type inference keeps each id as one consistent string, so a later
# groupby on `userid` cannot split one account across an int key and a str key.
rus_df = pd.read_csv(
    'data/ira_tweets.csv',
    nrows=100000,
    dtype={'userid': str},
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Eyeball 10 randomly chosen rows (no seed is set, so the rows differ on each run).
rus_df.sample(10)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,account_creation_date,...,latitude,longitude,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,poll_choices
32952,525345955189911552,55ba49c2cf8080dde0a69f81ad2f32379439b63f77ca30...,55ba49c2cf8080dde0a69f81ad2f32379439b63f77ca30...,55ba49c2cf8080dde0a69f81ad2f32379439b63f77ca30...,Москва,"Помощник адвоката, специализация-помощь и прав...",,693,1979,2014-03-21,...,,,0.0,0.0,0.0,0.0,,[http://riafan.ru/124346-boeviki-boko-haram-po...,[2570574680],
63532,519388938424041472,c88fefb20617eb1b67091325d6eac28d918014c1711b92...,c88fefb20617eb1b67091325d6eac28d918014c1711b92...,c88fefb20617eb1b67091325d6eac28d918014c1711b92...,Saint Petersburg,"Обожаю США, ненавижу их политику",,842,1934,2013-12-23,...,,,0.0,0.0,0.0,0.0,,,[19503310],
93517,842777552800354308,4508630900,John Davis,TheFoundingSon,"Texas, USA","Business Owner, Proud Father, Conservative, Ch...",,47413,29429,2015-12-09,...,,,1.0,1.0,10.0,19.0,[],[],,
91577,734739925581586432,2882037326,Dana Gold,DanaGeezus,USA,Producer,,24920,12434,2014-10-29,...,,,0.0,0.0,5.0,1.0,[SignsYouAreAmerican],[],,
42392,658340486256529408,2591847731,Boston Today,TodayBostonMA,"Boston, USA","Boston's local news on Twitter. Breaking news,...",,18560,7276,2014-06-08,...,,,0.0,0.0,1.0,0.0,[politics],[],,
4140,564350310895079424,a2593aa1a5e25f019b04902f0ddaf266ba2e9309bf35e9...,a2593aa1a5e25f019b04902f0ddaf266ba2e9309bf35e9...,a2593aa1a5e25f019b04902f0ddaf266ba2e9309bf35e9...,Россия,"Велосипед, встречи с друзьями, интересные филь...",http://t.co/bAvLatOEbn,608,411,2014-03-09,...,,,0.0,0.0,0.0,0.0,[],[],[2665564544],
43331,630882813106110464,3c0574ae1f9eebeb8de679b74de4e21334115741046351...,3c0574ae1f9eebeb8de679b74de4e21334115741046351...,3c0574ae1f9eebeb8de679b74de4e21334115741046351...,Уфа,"фото,катание на хаски",,170,254,2014-05-23,...,,,0.0,0.0,0.0,0.0,[],[],[2518710111],
37238,714925054727815168,2928870434,Newspeak Daily,NewspeakDaily,,it's never too late to reinvent the bicycle,,5816,865,2014-12-13,...,,,0.0,0.0,0.0,0.0,[politics],[],,
48267,883199503486124033,c4c5985c2c4536ead45cd1c3b202391daefcb3fabcf14b...,c4c5985c2c4536ead45cd1c3b202391daefcb3fabcf14b...,c4c5985c2c4536ead45cd1c3b202391daefcb3fabcf14b...,"Stuttgart, Deutschland",Lokale Nachrichten aus Stuttgart. Hier kannst ...,,267,280,2017-04-13,...,,,0.0,0.0,0.0,0.0,[],[http://www.bild.de/regional/stuttgart/brand/u...,,
88051,478842786838364160,95baa7ffd40f6644480e82ccb8eec5cea1421b71e3f721...,95baa7ffd40f6644480e82ccb8eec5cea1421b71e3f721...,95baa7ffd40f6644480e82ccb8eec5cea1421b71e3f721...,Москва,"Иду к своей цели,несмотря ни на что!",,1509,382,2012-08-28,...,,,0.0,0.0,0.0,0.0,,[http://bit.ly/1ycUknn],,


In [23]:
# Select the non-retweet rows whose tweet_language is 'en', keeping only the
# listed columns, in a single .loc call.
rus_df_filt = rus_df.loc[
    rus_df['is_retweet'].eq(False) & rus_df['tweet_language'].eq('en'),
    ['userid', 'tweet_text', 'is_retweet', 'hashtags', 'urls'],
]

rus_df_filt.sample(10)

Unnamed: 0,userid,tweet_text,is_retweet,hashtags,urls
27232,cc19cd3fba790e5aa198cc54d51c6fc7fa022d16ffdaf8...,Gingrich says Loyal Giuliani would make a bett...,False,[],[http://viid.me/qqcIA3]
30926,2943515140,Afghan envoy says regional Islamic State leade...,False,"[world, news]",[]
92873,87a4a47fd06dcadfdd84e02849dd9e059ad3cd2b1cec36...,https://t.co/mfBjIJYL05 Rewind | There is aff...,False,[HedgeFund],"[http://Covfefe.bz, https://hedgeaccordingly.c..."
62917,2570017414,Growing push for more tax breaks in Haslam roa...,False,[],[http://www.wsmv.com/story/34600176/growing-pu...
62349,2495567768,Massachusetts man allegedly ran over ex-wife 4...,False,[],[http://kron4.com/2017/03/24/prosecutors-massa...
85454,124dbafe1a21b410232e839e316d2ba396b4623779460d...,Miller destroys the betamale from #FakeNewsCNN...,False,[FakeNewsCNN],[https://twitter.com/i/web/status/892876342718...
30177,2620614029,Protest Held Over Man Killed By Philly Police ...,False,[],[http://philadelphia.cbslocal.com/2017/06/19/p...
107,2882331822,#IfAnimalsMadeLaws all parrots would be equal ...,False,"[IfAnimalsMadeLaws, EqualityForAll]",[]
86548,2882037326,Turner and Ouch #MakeAMoviePainful,False,[MakeAMoviePainful],[]
7834,bab856e112589a34fd6fae2f463d4371bb5ec65cd2627a...,@WashDCOnline great. I stay woke about that.,False,[],[]


In [24]:
# Show the full text of one tweet, addressed by its index label.
rus_df_filt.loc[80841, 'tweet_text']

"Can't sleep so:   Loving,Beautiful,Can't be replaced #Momin5words #rip"

In [25]:
# Display a larger random sample (100 rows) of the filtered frame.
rus_df_filt.sample(100)

Unnamed: 0,userid,tweet_text,is_retweet,hashtags,urls
61162,0945c828c3f3fa4064a6e67cbaa1c15827f6de9f3f9e69...,RT @Pheramuse: I'll say it... #IAmSickenedBy c...,False,[IAmSickenedBy],[]
3000,2943515140,#TopNews Rapper Snoop Dogg stopped in Italy ai...,False,[TopNews],[]
55909,75146e79e40c32b350e71225c2792acdb92fcf1ae940ee...,"Help yourself, then everyone will help you.Pri...",False,,
47807,aab3b7144689bfabbb8ebb22f50aab34a43ec2d8e20551...,#mar RT MADE__USA: RT TO TAKE IT DOWN: Take d...,False,"[mar, TakeItDown, TrumpUSA, MAGA, BillClinton]",[]
93330,60f0b63e612aa72961a991ed83ad3318ccaa22f9c343ea...,http://t.co/o0T5IHcamm My workout made me feel...,False,[],[http://www.LoseFat.pw/My-workout-made-me-feel...
77932,2753146444,"LeBron helps parents earn degrees with ""I Prom...",False,"[Cleveland, news]",[http://bit.ly/1OE9pox]
8327,2753146444,Car is covered with chocolate syrup: Medina Po...,False,[crime],[]
49811,2587843805,Bridge construction causes delays for Chief fa...,False,[local],[]
14710,3899481526,"With little peace prospects, France pushes new...",False,[],[https://www.reuters.com/article/us-un-assembl...
82129,e3e600eec5fae6f4bb6e40bec6c9b04d50313bbddf6d7b...,thank you to those who still are serving our c...,False,"[RememberYourHeroes, RemembranceDay]",


In [26]:
# Replace missing values with empty strings. The name is rebound to the filled
# frame instead of using inplace=True: rus_df_filt was derived by slicing
# rus_df, and in-place mutation of such a frame can raise
# SettingWithCopyWarning.
rus_df_filt = rus_df_filt.fillna('')
# One row per userid; each remaining column becomes a Python list of that
# user's values.
grouper = rus_df_filt.groupby('userid').agg(list)

In [28]:
# Preview the per-user tweet lists for the first 100 users.
grouper['tweet_text'].head(100).tolist()

[['Laugh it all off in your face',
  'The key to success is to keep growing in all areas of life - mental, emotional, spiritual, as well as physical. ~ Julius Erving',
  'You are not to blame for',
  "Politics is not perfect but it's the best available nonviolent means of changing how we live.",
  'we can have some more',
  'till you were screaming at the edge of the bed "Nobody moves, nobody gets hurt!" #true #love',
  'Politics is a love-hate relationship. I sure know that.',
  'squealer. i made her understand #smart',
  'Hear much, speak little',
  '@92ae5deba7794025dd01d545f48e0c3442a30f79dd12b53ca22318cf5e86bb20 Quote of the day',
  "Just like I did with addiction I'm 'bout to kick it",
  'sticks and stones will break your bones and leave them lying in the mud',
  'come on and let it out',
  'A sound mind in a sound body',
  '#Boston looks fantastic on photos',
  "A man's country is not a certain area of land, of mountains, rivers, and woods, but it is a principle; #USA",
  'For w