In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer

In [2]:
# Load the tweet dataset from the CSV file in the current working directory.
data = pd.read_csv('iranian_tweets.csv')

In [3]:
# Summarise the loaded frame: row count, per-column dtype and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122936 entries, 0 to 1122935
Data columns (total 31 columns):
tweetid                     1122936 non-null int64
userid                      1122936 non-null object
user_display_name           1122936 non-null object
user_screen_name            1122936 non-null object
user_reported_location      887669 non-null object
user_profile_description    995845 non-null object
user_profile_url            434954 non-null object
follower_count              1122936 non-null int64
following_count             1122936 non-null int64
account_creation_date       1122936 non-null object
account_language            1122936 non-null object
tweet_language              1117307 non-null object
tweet_text                  1122936 non-null object
tweet_time                  1122936 non-null object
tweet_client_name           1100078 non-null object
in_reply_to_tweetid         339350 non-null float64
in_reply_to_userid          440244 non-null object
quoted_twe

In [4]:
# Narrow the frame to the columns used by the text-processing cells below.
df = data[
    [
        'userid',
        'tweet_language',
        'tweet_text',
        'is_retweet',
        'hashtags',
    ]
]

In [5]:
# Keep only English tweets that are not retweets.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments in later cells (df['tokens'] = ..., df['lemma'] = ...) write to
# it directly rather than to a slice of `data`, which is what triggers
# pandas' SettingWithCopyWarning.
# `== False` is kept deliberately (instead of `~`) so rows are selected the
# same way regardless of the column's dtype.
df = df[(df.is_retweet == False) & (df.tweet_language == 'en')].copy()

In [6]:
# Baseline tokenisation with NLTK's generic word tokenizer, one token list per tweet.
df['tokens'] = df['tweet_text'].apply(nltk.word_tokenize)

In [7]:
# Peek at the first three token lists.
df['tokens'].values[:3]

array([list(['@', 'ParkerLampe', 'An', 'inquiry', 'by', 'congress', 'confirms', 'that', 'ISIS', 'is', 'indeed', 'a', 'CIA', 'creation', 'http', ':', '//t.co/eFRmFwYZTV']),
       list(['@', 'hadeelhmaidi', '@', 'wordpressdotcom', 'CIA', 'predict', 'third', 'terrorist', 'attack', 'after', 'Sidney', 'and', 'Pakiistan', 'in', 'USA', 'in', '3', 'days', 'http', ':', '//t.co/IrPx7M223N']),
       list(['@', 'irfhabib', 'why', 'boko', 'haram', 'come', 'europe', ',', 'legally', 'and', 'easily', '?', 'http', ':', '//t.co/on2vzPqEPH'])],
      dtype=object)

In [15]:
# Shared default tokenizer; built on first use so every row reuses one instance
# instead of constructing a new TweetTokenizer per tweet.
_DEFAULT_TWEET_TOKENIZER = None


def tweet_tokenize(tweet, tokenizer=None):
    """Split a single tweet into a list of tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    tokenizer : object with a ``tokenize(text)`` method, optional
        Tokenizer to use. When omitted, a shared
        ``TweetTokenizer(reduce_len=True, strip_handles=True)`` is used,
        which is the configuration this function has always applied.

    Returns
    -------
    list
        The result of ``tokenizer.tokenize(tweet)``.
    """
    global _DEFAULT_TWEET_TOKENIZER
    if tokenizer is None:
        if _DEFAULT_TWEET_TOKENIZER is None:
            _DEFAULT_TWEET_TOKENIZER = TweetTokenizer(reduce_len=True, strip_handles=True)
        tokenizer = _DEFAULT_TWEET_TOKENIZER
    return tokenizer.tokenize(tweet)

In [16]:
# Tweet-aware tokenisation: one token list per tweet, produced by tweet_tokenize.
df['tweet_tokens'] = df['tweet_text'].apply(tweet_tokenize)

In [34]:
# Inspect the tweet-aware token lists at positions 5 through 9.
df['tweet_tokens'].values[5:10]

array([list(['What', 'would', 'happen', 'to', 'you', 'if', 'you', 'were', 'a', 'political', 'dissent', 'in', 'Saudi', 'Arabia', '?', 'http://t.co/n1pHsz1UsX']),
       list(['Saudi', 'embassy', 'in', 'Turkey', 'became', 'a', 'safe', 'haven', 'for', 'ISIL', 'terrorists', 'http://t.co/BsMnfC1xqU']),
       list(['We', 'can', 'destroy', 'Israel', 'in', '‘', 'less', 'than', '12', 'minutes', '’', ':', 'Pakistani', 'commander', 'https://t.co/CSkZ4Z6ke1']),
       list(['what', 'they', 'will', 'never', 'tell', 'you', 'about', 'Christmas', 'http://t.co/ovd4gGlWLE']),
       list(['is', 'there', 'a', 'secret', 'collaboration', 'between', 'U', '.', 'S', 'and', 'Iran', 'against', 'isis', '?', 'http://t.co/Fw1IH3qOCb'])],
      dtype=object)

In [37]:
# Shared WordNet lemmatizer; built on first use so every row reuses one
# instance instead of constructing a new one per tweet.
_WORDNET_LEMMATIZER = None


def lemmatizer(tokens, pos='n'):
    """Lemmatize every token in a list with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet.
    pos : str, optional
        Part-of-speech tag passed to ``WordNetLemmatizer.lemmatize``.
        Defaults to ``'n'`` (noun), which is what the lemmatizer uses when
        no tag is given, so existing calls behave as before.

    Returns
    -------
    list of str
        One lemma per input token, in the original order.
    """
    global _WORDNET_LEMMATIZER
    if _WORDNET_LEMMATIZER is None:
        _WORDNET_LEMMATIZER = nltk.stem.WordNetLemmatizer()
    lemmatize = _WORDNET_LEMMATIZER.lemmatize
    return [lemmatize(token, pos=pos) for token in tokens]

In [38]:
# Lemmatize the tweet-aware tokens, tweet by tweet.
df['lemma'] = df['tweet_tokens'].apply(lemmatizer)

In [43]:
# Peek at the first five lemmatized token lists.
df['lemma'].values[:5]

array([list(['An', 'inquiry', 'by', 'congress', 'confirms', 'that', 'ISIS', 'is', 'indeed', 'a', 'CIA', 'creation', 'http://t.co/eFRmFwYZTV']),
       list(['CIA', 'predict', 'third', 'terrorist', 'attack', 'after', 'Sidney', 'and', 'Pakiistan', 'in', 'USA', 'in', '3', 'day', 'http://t.co/IrPx7M223N']),
       list(['why', 'boko', 'haram', 'come', 'europe', ',', 'legally', 'and', 'easily', '?', 'http://t.co/on2vzPqEPH']),
       list(['ISIS', 'militant', ',', 'plan', 'to', 'target', 'Western', 'capital', 'http://t.co/890VDVPE6o']),
       list(['Turkish', 'intelligence', 'chief', ':', 'ISIS', 'is', 'a', 'reality', 'and', 'we', 'are', 'optimistic', 'about', 'the', 'future', '!', 'http://t.co/8HnO2uDac2'])],
      dtype=object)

In [40]:
# English stop words from NLTK, loaded on first use and then reused, so the
# corpus is not re-read and the set is not rebuilt for every tweet.
_ENGLISH_STOPWORDS = None


def strip_stops(tweet, stops=None, ignore_case=False):
    """Remove stop words from a list of tokens.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of a single tweet.
    stops : collection of str, optional
        Stop words to remove. Defaults to NLTK's English stop-word list.
    ignore_case : bool, optional
        When False (the default, and the historical behaviour) matching is
        case-sensitive, so capitalised stop words such as 'An' are kept.
        When True each token is lower-cased before the lookup; this assumes
        `stops` holds lower-case entries, and it also drops upper-case
        tokens whose lower-case form is a stop word.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order and case.
    """
    global _ENGLISH_STOPWORDS
    if stops is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
        stops = _ENGLISH_STOPWORDS
    if ignore_case:
        return [token for token in tweet if token.lower() not in stops]
    return [token for token in tweet if token not in stops]

In [41]:
# Drop stop words from each tweet's token list.
# NOTE(review): this filters the raw `tweet_tokens`, not the `lemma` column
# computed earlier - confirm that is intended.
df['topic_words'] = df['tweet_tokens'].apply(strip_stops)

In [42]:
# Peek at the first five stop-word-filtered token lists.
df['topic_words'].values[:5]

array([list(['An', 'inquiry', 'congress', 'confirms', 'ISIS', 'indeed', 'CIA', 'creation', 'http://t.co/eFRmFwYZTV']),
       list(['CIA', 'predict', 'third', 'terrorist', 'attack', 'Sidney', 'Pakiistan', 'USA', '3', 'days', 'http://t.co/IrPx7M223N']),
       list(['boko', 'haram', 'come', 'europe', ',', 'legally', 'easily', '?', 'http://t.co/on2vzPqEPH']),
       list(['ISIS', 'militants', ',', 'plan', 'target', 'Western', 'capitals', 'http://t.co/890VDVPE6o']),
       list(['Turkish', 'intelligence', 'chief', ':', 'ISIS', 'reality', 'optimistic', 'future', '!', 'http://t.co/8HnO2uDac2'])],
      dtype=object)