# Processing Text

Steps:
   - lowercase
   - remove URLs, mentions, emojis, smileys and numbers
   - tokenize
   - remove stopwords
   - lemmatize

['one,',
 'political,',
 'philly,',
 'dnc2016,',
 'dncinphl,',
 'convention,',
 'democratic,',
 'dnc,',
 'demconvention,',
 'things,',
 'talk,',
 'like,',
 'national,',
 'pennsylvania,',
 'speech,',
 'live']

In [18]:
import pandas as pd
import csv
import re #regular expression
from textblob import TextBlob
import string
import preprocessor as p
from nltk.tokenize import word_tokenize

# Tokens that clean_tweets drops from every tokenized tweet. Three groups are
# mixed together in this one set:
#   * convention-specific terms ('dnc', 'philly', 'demconvention', ...),
#   * contraction fragments ("n't", "'re", "'s"),
#   * common English function words -- this part appears to mirror NLTK's
#     English stop-word list (cf. the commented-out stopwords.words('english')
#     call in clean_tweets) -- TODO confirm.
# All entries are lowercase; clean_tweets lowercases the text before matching.
custom_stop_words =  {'demsinphilly','one',
 'democrats',
 'political',
 'philly',
 'dnc2016',
 'dncinphl',
 'convention',
 'democratic',
 'dnc',
 'demconvention',
 'things',
 'talk',
 'like',
 'national',
 'pennsylvania',
 'speech',
 'live',"n't","'re","'s","doesn't", 'in', 'down', 'you', 's', "weren't", 'before', 'mustn', 'shan', "isn't", 'mightn', 'than', 'aren', 'that', 'from', "that'll", 'wasn', "should've", 'weren', 'do', 'once', 'those', "aren't", 'no', 'hasn', 'himself', 'own', 'shouldn', 'does', 'all', 'your', 'isn', 'm', 'has', 'been', 'these', "hadn't", 'because', 'with', 'having', 'didn', 'up', 'myself', 'between', 'for', 'is', 'at', 'where', 'and', 'be', 'won', 'll', 'or', "mightn't", 'above', 'too', "you'll", "won't", 'he', 'have', 'into', 'of', 'after', 'ma', 'just', "didn't", 'am', 'but', 'couldn', 'the', 'nor', 'being', 'same', 'will', 'theirs', 'while', 'again', 't', 'can', "shan't", 'off', 'any', 'd', 'by', 'our', 'some', 'so', 'themselves', 'what', 'an', 'i', 'only', 'as', 'are', "you're", 'they', 'both', 'them', 'very', 'other', 'a', 'few', "you've", "wasn't", 'yourselves', 'over', "don't", 've', 'against', 'below', 'was', 'my', 'out', 'doing', "needn't", 'did', 'about', "hasn't", 'whom', 'how', 'why', 'further', 'o', 'on', 'not', 'ain', "you'd", 'here', 'y', 'if', "haven't", 'she', 'during', 'we', 'ours', 'through', 'then', 'her', "she's", 'their', 'which', 'don', 'hadn', 'its', 'yours', 'had', 'me', 'his', 'hers', 'when', 're', 'most', 'itself', "it's", 'under', 'now', 'to', 'him', "mustn't", 'yourself', 'should', 'until', 'it', "couldn't", 'who', 'each', "shouldn't", 'there', 'ourselves', 'haven', 'wouldn', 'herself', "wouldn't", 'needn', 'were', 'more', 'this', 'doesn', 'such'}


# Configure the tweet preprocessor (`p`) used by clean_tweets: the options
# selected are URLs, emojis, @mentions, smileys and numbers. No hashtag or
# reserved-word option is listed here.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.NUMBER)

# Happy emoticons: ASCII smileys, grins, kisses, tongues and hearts.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>',
    '=]', '8)', '=)', ':}', ':^)', ':-D', ':D', '8-D',
    '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3',
    '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P',
    ':P', 'X-P', 'x-p', 'xp', 'XP', ':-p', ':p', '=p',
    ':-b', ':b', '>:)', '>;)', '>:-)', '<3',
}

# Sad emoticons: ASCII frowns, skeptical/annoyed faces and crying faces.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[',
    ':-||', '=L', ':<', ':-[', ':-<', '=\\', '=/', '>:(',
    ':(', '>.<', ":'-(", ":'(", ':\\', ':-c', ':c', ':{',
    '>:\\', ';(',
}

# Emoji pattern: one character class built from the Unicode ranges below,
# matching one or more consecutive characters from any of them.
# NOTE(review): the last range (U+24C2 .. U+1F251) is very wide and also
# covers non-emoji text such as CJK ideographs -- confirm that is intended.
_emoji_ranges = (
    u"\U0001F600-\U0001F64F",  # emoticons
    u"\U0001F300-\U0001F5FF",  # symbols & pictographs
    u"\U0001F680-\U0001F6FF",  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    u"\U00002702-\U000027B0",
    u"\U000024C2-\U0001F251",
)
emoji_pattern = re.compile(u"[" + u"".join(_emoji_ranges) + u"]+", flags=re.UNICODE)
# Single lookup set holding every happy and sad emoticon.
emoticons = emoticons_happy | emoticons_sad

def clean_tweets(tweet):
    """Normalise a raw tweet (or profile description) into a string of tokens.

    Processing order:
      1. ``p.clean`` with the module-level preprocessor options, then lowercase.
      2. Strip colons, the mis-decoded ellipsis sequence, non-ASCII runs and
         emoji from the text.
      3. Tokenize the *cleaned* text with NLTK's ``word_tokenize``.
      4. Drop stop words, emoticons and punctuation-only tokens.

    Parameters
    ----------
    tweet : str
        Raw text. Any non-string value (e.g. the float NaN pandas uses for a
        missing cell) is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` if none survive.
    """
    # Missing cells reach us as NaN/None rather than str; nothing to clean.
    if not isinstance(tweet, str):
        return ''

    tweet = p.clean(tweet).lower()
    # per the original note: a colon is left behind once mentions are removed
    tweet = re.sub(r':', '', tweet)
    # mis-decoded ellipsis character sequence
    tweet = re.sub(r'‚Ä¶', '', tweet)
    # replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    # remove emojis from tweet (kept in the original position; any emoji still
    # present has already been replaced by the non-ASCII pass above)
    tweet = emoji_pattern.sub(r'', tweet)

    # Tokenize only now, so the substitutions above actually affect the
    # result (tokenizing first would discard all of that cleaning).
    word_tokens = word_tokenize(tweet)

    # Per-character set so multi-character punctuation tokens ('...', "''")
    # are recognised; a substring test against string.punctuation misses them.
    punctuation = set(string.punctuation)
    filtered_tweet = [
        w for w in word_tokens
        if w not in custom_stop_words
        and w not in emoticons
        and not all(ch in punctuation for ch in w)
    ]
    return ' '.join(filtered_tweet)

## Democratic Convention  

In [14]:
# Load the Democratic convention tweets (path is relative to the working directory).
dconvention_df = pd.read_csv('large_data/dconvention-tweet-ids.csv')
# Summary statistics followed by the column index; the two results are the
# cell outputs pasted below, in this order.
dconvention_df.describe()
dconvention_df.columns

Unnamed: 0,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,reweet_id,retweet_screen_name,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_statuses_count,user_time_zone
count,134687.0,134687.0,7540.0,17244.0,134687.0,0.0,0.0,134687.0,134687.0,134687.0,134687.0,134687.0,0.0
mean,3.345653,7.582166e+17,7.576638e+17,1.624101e+16,1.681061,,,32161.83,9140.628,2767.032609,133.059946,47978.42,
std,93.861568,522653900000000.0,1.226524e+16,1.07366e+17,60.445901,,,66001.0,91326.16,7243.250845,484.443211,92825.56,
min,0.0,7.564538e+17,4.238422e+16,12.0,0.0,,,0.0,0.0,0.0,0.0,1.0,
25%,0.0,7.577726e+17,7.576961e+17,16511950.0,0.0,,,2686.0,392.0,520.0,11.0,6261.0,
50%,0.0,7.581443e+17,7.58065e+17,55245920.0,0.0,,,10194.0,1110.0,1166.0,34.0,17799.0,
75%,1.0,7.586317e+17,7.584846e+17,526689900.0,0.0,,,31723.0,3352.0,2645.0,97.0,49175.0,
max,30492.0,7.594183e+17,7.594173e+17,7.591036e+17,20768.0,,,1053126.0,13724330.0,317437.0,30229.0,2230300.0,


Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

In [19]:
# Normalise every tweet's text, overwriting the 'text' column; clean_tweets is
# handed to apply directly as the per-value callback.
dconvention_df['text'] = dconvention_df['text'].apply(clean_tweets)

In [20]:
#clean description column
# Missing descriptions are replaced with '' so clean_tweets always receives a
# string. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# warned about by recent pandas and does not update the frame under
# Copy-on-Write.
dconvention_df['user_description'] = (
    dconvention_df['user_description'].fillna('').apply(clean_tweets)
)

In [21]:
#save the cleaned text df
# NOTE(review): this is an absolute, machine-specific path, while the inputs
# in this notebook are read through relative 'large_data/...' paths -- consider
# a relative path here too. The row index is written as well (to_csv default).
dconvention_df.to_csv('/Users/jacobmullins/data-science-immersive/capstone_2/large_data/dconvention_df.csv')

In [111]:
# Load the labelled community nodes, using the 'Id' column as the index.
dcommunity_df = pd.read_csv('large_data/communities_dconvention.csv', index_col='Id')
# Discard the two columns that are not needed, in a single drop call.
dcommunity_df = dcommunity_df.drop(columns=['timeset', 'Label'])
dcommunity_df

Unnamed: 0_level_0,modularity_class
Id,Unnamed: 1_level_1
187059363,29
17546958,29
551557346,18
25073877,29
1339835893,18
...,...
224810490,196
576520016,70
616692598,70
587591389,18


In [112]:
# Load the mention-network node table; the output below shows 'Id' and 'Label' columns.
dnodes_df = pd.read_csv('data/democrat/convention_mentions/dmention-nodes.csv')
dnodes_df

Unnamed: 0,Id,Label
0,17546958,weeklystandard
1,25073877,realDonaldTrump
2,1339835893,HillaryClinton
3,255645890,Natire2u
4,909448512,AnitaDWhitee
...,...,...
15732,224810490,LatinoCommFdn
15733,576520016,cspanMatthew
15734,616692598,ShellsBells143
15735,587591389,RevJacquiLewis


## Republican Convention 

In [79]:
# Load the Republican convention tweets (path is relative to the working directory).
rconvention_df = pd.read_csv('large_data/rconvention-tweet-ids.csv')
# Summary statistics followed by the column index; the two results are the
# cell outputs pasted below, in this order.
rconvention_df.describe()
rconvention_df.columns

Unnamed: 0,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,reweet_id,retweet_screen_name,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_statuses_count,user_time_zone
count,113631.0,113631.0,5650.0,12410.0,113631.0,0.0,0.0,113631.0,113631.0,113631.0,113631.0,113631.0,0.0
mean,3.855304,7.556725e+17,7.54974e+17,1.055017e+16,1.846283,,,32644.99,13164.84,2666.006187,150.543109,47951.56,
std,117.44966,470777800000000.0,1.442378e+16,8.699262e+16,52.296739,,,66404.11,481713.2,7370.316474,901.512807,85281.17,
min,0.0,7.539204e+17,2.325725e+17,12.0,0.0,,,0.0,0.0,0.0,0.0,2.0,
25%,0.0,7.552497e+17,7.552227e+17,18916430.0,0.0,,,2916.5,395.0,545.0,11.0,7076.0,
50%,0.0,7.555901e+17,7.555728e+17,40069010.0,0.0,,,10365.0,1162.0,1188.0,35.0,19659.0,
75%,1.0,7.560576e+17,7.559575e+17,254190000.0,0.0,,,32644.5,3377.0,2717.0,104.0,53377.0,
max,26220.0,7.564522e+17,7.564517e+17,7.562792e+17,11502.0,,,1053127.0,79679070.0,508859.0,116472.0,1611752.0,


Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

In [80]:
#clean text column
rconvention_df['text'] = rconvention_df['text'].apply(clean_tweets)
#clean description column
# Missing values become '' so clean_tweets always receives a string. The result
# is assigned back to the column rather than using fillna(inplace=True) on a
# column selection, which recent pandas warns about and which does not update
# the frame under Copy-on-Write.
rconvention_df['user_description'] = (
    rconvention_df['user_description'].fillna('').apply(clean_tweets)
)

#clean the hashtags so we can re-add them to the tweet text
# NOTE(review): this cell belongs to the Republican section, yet the hashtag
# step operates on dconvention_df -- confirm whether rconvention_df was meant.
dconvention_df['hashtags'] = dconvention_df['hashtags'].fillna('').apply(clean_tweets)
#add the hashtags back
# A space separator keeps the last word of the text from fusing with the first
# hashtag; strip() removes that separator again when either side is empty.
dconvention_df['text'] = (
    dconvention_df['text'] + ' ' + dconvention_df['hashtags']
).str.strip()

In [81]:
#save the cleaned text df
# NOTE(review): this is an absolute, machine-specific path, while the inputs
# in this notebook are read through relative 'large_data/...' paths -- consider
# a relative path here too. The row index is written as well (to_csv default).
rconvention_df.to_csv('/Users/jacobmullins/data-science-immersive/capstone_2/large_data/rconvention_df.csv')