In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# Manual Preprocessing (Lowercase All Text)

In [2]:
#make all lowercase
def lower_df_text(df, text_columns):
    """Lowercase the string values of the given columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; each listed column is overwritten with its
        lowercased version.
    text_columns : str or iterable of str
        Column name(s) to lowercase. A single column name may be passed as a
        plain string.

    Returns
    -------
    None
        The frame is mutated in place; nothing is returned.

    Notes
    -----
    Cells that are not strings (for example NaN/None for missing text) are
    left untouched instead of raising ``AttributeError``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(text_columns, str):
        text_columns = [text_columns]

    for column in text_columns:
        df[column] = df[column].apply(
            lambda value: value.lower() if isinstance(value, str) else value
        )

# Generating Wordclouds

In [3]:
def plot_wordcloud(text, figname='wordcloud', rgb=(255,255,255), font_path=None):
    """Render a single-colour word cloud for ``text`` and save it as an image.

    Parameters
    ----------
    text : str
        Text handed to ``WordCloud.generate``.
    figname : str
        File name of the saved figure, relative to ``data/wordclouds/``.
    rgb : tuple of int
        Colour returned by the colour function for every word.
        NOTE(review): the default is white, which matches the white
        ``background_color`` used below, so words are presumably invisible
        unless a colour is passed explicitly - confirm the intended default.
    font_path : str or None
        Font file used to draw the words. When omitted, the macOS
        "DIN Condensed Bold" font is used if it exists on this machine;
        otherwise ``None`` is passed so WordCloud falls back to its own
        default font.

    Returns
    -------
    None
        The figure is written to ``data/wordclouds/<figname>``.
    """
    if font_path is None:
        # Keep the original look where the font is installed, but do not fail
        # on machines that do not have it.
        preferred_font = "/Library/Fonts/DIN Condensed Bold.ttf"
        font_path = preferred_font if os.path.exists(preferred_font) else None

    wc = WordCloud(background_color="white", max_words=25,
                   collocations=False, font_path=font_path, scale=5,
                   color_func=lambda *args, **kwargs: rgb)
    wc.generate(text)

    # Draw on the axes we just created instead of relying on pyplot's
    # implicit "current axes" state.
    fig, ax = plt.subplots(figsize=(14,18))
    ax.imshow(wc)

    out_path = 'data/wordclouds/{}'.format(figname)
    # Saving fails if the target directory is missing, so create it first.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, dpi=240)


# Generating Network Graphs


In [4]:
import networkx as nx
from networkx.algorithms import community

In [5]:
# Functionize

def generate_graph(edge_filepath, node_filepath, title='gephi_graph'):
    """Build a directed graph from edge/node CSV files and export it as GEXF.

    Parameters
    ----------
    edge_filepath : str
        Header-less CSV whose two columns are read as (source, target).
    node_filepath : str
        Header-less CSV whose two columns are read as (node, alias). Only
        the node column is added to the graph.
    title : str
        Base name of the exported file. ``.gexf`` is appended when missing.

    Returns
    -------
    networkx.DiGraph
        The constructed graph. It is also written to ``<title>.gexf`` in the
        same directory as ``edge_filepath``.

    Notes
    -----
    The previous implementation called ``edge_filepath.append(...)``, which
    always raised ``AttributeError`` because ``str`` has no ``append``.
    NOTE(review): writing next to the edge file is the assumed intent -
    confirm the desired output location.
    """
    edges = pd.read_csv(edge_filepath, header=None)
    nodes = pd.read_csv(node_filepath, header=None)
    edges.columns = ['Source', 'Target']
    nodes.columns = ['Node', 'Alias']
    # Cast everything to str so identifiers compare equal across both files.
    nodes = nodes.astype('str')
    edges = edges.astype('str')

    G = nx.DiGraph()
    G.add_nodes_from(nodes['Node'])
    G.add_edges_from(zip(edges['Source'], edges['Target']))

    # nx.info() was removed in NetworkX 3.0; print the same summary by hand
    # so the function works on both old and new versions.
    print('{} with {} nodes and {} edges'.format(
        type(G).__name__, G.number_of_nodes(), G.number_of_edges()))

    filename = title if title.endswith('.gexf') else '{}.gexf'.format(title)
    out_path = os.path.join(os.path.dirname(edge_filepath), filename)
    nx.write_gexf(G, out_path)
    return G
    

# Text Processing Pipeline (Too Slow)
Adapted from https://www.kaggle.com/balatmak/text-preprocessing-steps-and-universal-pipeline#Reusable-pipeline

In [6]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise

# Load the spaCy English model once at module level; TextPreprocessor._preprocess_text
# reuses this shared `nlp` object for every document it parses.
nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that normalises, tokenises, filters and lemmatises text.

    Each input string is passed through ``normalise``, parsed with the
    module-level spaCy pipeline ``nlp``, stripped of punctuation and stop
    words, and returned as a space-joined string of lemmas.
    """

    # Individual punctuation characters, used to recognise tokens that are
    # made up entirely of punctuation (e.g. "..." or "--").
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1, custom_stop_words=[]):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization

        variety - format of date (AmE - american type, BrE - british format)
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run: -1 (or lower) uses every core, 0 or 1
                 runs in the current process, any other value is capped at
                 the number of cores
        custom_stop_words - extra token texts to drop in addition to spaCy's
                            stop words
        """
        # Parameters are stored unmodified. This class only reads the mutable
        # defaults, it never modifies them.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs
        self.custom_stop_words=custom_stop_words

    def fit(self, X, y=None):
        """No-op fit; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, *_):
        """Preprocess every element of ``X`` and return the cleaned texts.

        ``X`` is expected to behave like a pandas Series of strings (it must
        support ``copy``, ``apply``, ``iloc`` and ``len``). The input is
        copied, so the caller's object is not modified.
        """
        X_copy = X.copy()

        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs == 0:
            partitions = 1
        else:
            partitions = min(self.n_jobs, cores)

        # A single partition gains nothing from a process pool, so avoid the
        # start-up and pickling overhead and run in this process.
        if partitions <= 1 or len(X_copy) == 0:
            return X_copy.apply(self._preprocess_text)

        # Split by position rather than calling np.array_split on the pandas
        # object itself, which relies on the deprecated ``swapaxes`` method.
        position_chunks = np.array_split(np.arange(len(X_copy)), partitions)
        data_split = [X_copy.iloc[chunk] for chunk in position_chunks if len(chunk)]

        # Size the pool by the number of chunks (honouring n_jobs) and make
        # sure the workers are released even if a chunk raises.
        pool = mp.Pool(len(data_split))
        try:
            processed_parts = pool.map(self._preprocess_part, data_split)
        finally:
            pool.close()
            pool.join()

        return pd.concat(processed_parts)

    def _preprocess_part(self, part):
        """Apply the per-text pipeline to one chunk (runs in a worker)."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run normalise -> spaCy parse -> punctuation/stop-word filter -> lemmatise."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Return ``text`` normalised by ``normalise``, or unchanged on failure."""
        # some issues in normalise package: fall back to the raw text on any
        # ordinary error, but let KeyboardInterrupt/SystemExit propagate.
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except Exception:
            return text

    def _remove_punct(self, doc):
        """Drop tokens whose text consists only of punctuation characters."""
        # `t.text not in string.punctuation` was a substring test and let
        # multi-character punctuation such as "..." through.
        return [t for t in doc
                if not all(ch in self._PUNCTUATION for ch in t.text)]

    def _remove_stop_words(self, doc):
        """Drop spaCy stop words, custom stop words and tokens containing 'https'."""
        custom_stop_words = set(self.custom_stop_words)
        return [t for t in doc
                if not t.is_stop
                and t.text not in custom_stop_words
                and 'https' not in t.text]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens with single spaces."""
        return ' '.join([t.lemma_ for t in doc])



# Democratic Convention Data


In [67]:
# Load the Democratic convention tweet export into a DataFrame.
dconvention_df = pd.read_csv('large_data/dconvention-tweet-ids.csv')
# Summary counts recorded by the author (the node/edge figures presumably
# describe a network built from these tweets - TODO confirm):
# nodes = 15,737
# edges = 88,537
# tweets = 134,687
# total users = 41,000


In [71]:
# Display the loaded tweets; the saved output is the truncated pandas preview.
dconvention_df

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Fri Jul 29 07:28:18 +0000 2016,DemsInPhilly HillaryClinton,,https://youtu.be/pnXiy4D_I8g,0,758927111146962945,,,,...,111,293,6,London,Tchi Mbouani,tchimbouani,832,,http://www.waterofpeace.blog,False
1,,Mon Jul 25 05:59:10 +0000 2016,LoveAndKindness DemsInPhilly ImWithHer Clinton...,https://twitter.com/LoriSums/status/7574551296...,,7,757455129687629824,,,,...,7042,6064,210,,Lori,LoriSums,482372,,,False
2,,Wed Jul 27 00:34:54 +0000 2016,GMO DemsInPhilly DemConvention,,,2,758098299501023232,,,,...,3670,2820,151,NYC,Karen K,kreativekonnect,139906,,https://www.instagram.com/karkap617/,False
3,,Fri Jul 29 03:25:26 +0000 2016,DemsInPhilly,,,6,758865994383106053,,,,...,401,286,14,"Chicago, IL",Stephanie Hoeman,stephaniehoeman,5648,,,False
4,,Mon Jul 25 05:50:31 +0000 2016,LoveAndKindness DemsInPhilly ImWithHer Clinton...,https://twitter.com/LoriSums/status/7574529512...,,5,757452951203946496,,,,...,7042,6064,210,,Lori,LoriSums,482372,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134682,,Tue Jul 26 23:17:11 +0000 2016,dnc,,,1,758078741285187585,,,,...,1235,912,97,"Boulder, CO",Jann Scott TV,JannScott,17805,,http://c1n.tv/JannScottLive/,False
134683,,Thu Jul 28 03:40:08 +0000 2016,DemsInPhilly,,,0,758507305667334145,,,,...,1153,1803,12,London,Jules Mason,1975jpm,12574,,,False
134684,,Thu Jul 28 03:40:06 +0000 2016,DemsInPhilly DemConvention,,,0,758507297005932544,BarackObama,,813286.0,...,3882,3653,185,"Silicon Valley, CA",Elizabeth #StayAwayFromCA Drescher,edrescherphd,13926,,https://www.facebook.com/elizabethdrescherphd/,False
134685,,Wed Jul 27 01:09:40 +0000 2016,,,https://twitter.com/auburnseminary/status/7579...,6,758107050283008000,,,,...,2231,2430,78,NYC,Macky Alston,MackyAlston,4560,,http://www.mackyalston.com,False


In [None]:
# Load the node table from the Democratic convention mentions data.
dnodes_df = pd.read_csv('data/democrat/convention_mentions/dmention-nodes.csv')
dnodes_df
# NOTE(review): `dcommunity_df` is not defined anywhere in the visible notebook and this
# cell has no execution count - confirm where the community table comes from before running.
dnodes_df = dnodes_df.join(dcommunity_df, how='left', on='Id')
# Outer-merge tweets with nodes (screen name vs. node label) and keep the non-null
# `modularity_class` values (presumably supplied by `dcommunity_df` - TODO confirm).
dconvention_df.merge(dnodes_df, left_on='user_screen_name', right_on='Label', how='outer')['modularity_class'].dropna()

# Republican Convention Data 

In [9]:
# Load the Republican convention tweet export, then inspect its numeric summary
# and its column names.
# NOTE(review): the saved output shows both the describe() table and the column index,
# which suggests the kernel was set to display every expression, not just the last - confirm.
rconvention_df = pd.read_csv('large_data/rconvention-tweet-ids.csv')
rconvention_df.describe()
rconvention_df.columns

Unnamed: 0,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,reweet_id,retweet_screen_name,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_statuses_count,user_time_zone
count,113631.0,113631.0,5650.0,12410.0,113631.0,0.0,0.0,113631.0,113631.0,113631.0,113631.0,113631.0,0.0
mean,3.855304,7.556725e+17,7.54974e+17,1.055017e+16,1.846283,,,32644.99,13164.84,2666.006187,150.543109,47951.56,
std,117.44966,470777800000000.0,1.442378e+16,8.699262e+16,52.296739,,,66404.11,481713.2,7370.316474,901.512807,85281.17,
min,0.0,7.539204e+17,2.325725e+17,12.0,0.0,,,0.0,0.0,0.0,0.0,2.0,
25%,0.0,7.552497e+17,7.552227e+17,18916430.0,0.0,,,2916.5,395.0,545.0,11.0,7076.0,
50%,0.0,7.555901e+17,7.555728e+17,40069010.0,0.0,,,10365.0,1162.0,1188.0,35.0,19659.0,
75%,1.0,7.560576e+17,7.559575e+17,254190000.0,0.0,,,32644.5,3377.0,2717.0,104.0,53377.0,
max,26220.0,7.564522e+17,7.564517e+17,7.562792e+17,11502.0,,,1053127.0,79679070.0,508859.0,116472.0,1611752.0,


Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

# Democratic Candidates Data

# Republican Candidates Data

In [64]:
# Quick check of `preprocessor.clean` on a sample sentence.
# NOTE(review): in the saved output the digits "100" are stripped from the text -
# confirm that removing numbers is intended before using this on the tweet data.
import preprocessor as p
p.clean('preprocessing and it’s always better to be sure that our data is cleaned 100%.')

'preprocessing and it’s always better to be sure that our data is cleaned %.'