In [1]:
from wordcloud import WordCloud

# Manual PreProcessing (Lower Case All)

In [2]:
#make all lowercase
def lower_df_text(df, text_columns):
    for column in text_columns:
        df[column] = df[column].apply(lambda x: x.lower())
    return

# Generating Wordclouds

In [3]:
def plot_wordcloud(text, figname='wordcloud', rgb=(255,255,255)):
    font_path = "/Library/Fonts/DIN Condensed Bold.ttf"
    wc = WordCloud(background_color="white",max_words=25,
                   collocations=False, font_path=font_path, scale=5, color_func=lambda *args, **kwargs: rgb)
    wc.generate(text)
    
    fig, ax = plt.subplots(figsize=(14,18))
    plt.imshow(wc)
    plt.savefig('data/wordclouds/{}'.format(figname), dpi=240)


# Generating Network Graphs


In [4]:
import networkx as nx
from networkx.algorithms import community

In [5]:
# Functionize

def generate_graph(edge_filepath, node_filepath, title='gephi_graph'):
    edges = pd.read_csv(edge_filepath, header=None)
    nodes = pd.read_csv(node_filepath, header=None)
    edges.columns = ['Source', 'Target']
    nodes.columns = ['Node', 'Alias']
    nodes = nodes.astype('str')
    edges = edges.astype('str')
    edge_tuples = []
    for source, target in zip(edges['Source'], edges['Target']):
        edge_tuples.append((source, target))
    G = nx.DiGraph()
    G.add_nodes_from(nodes['Node'])
    G.add_edges_from(edge_tuples)
    print(nx.info(G))
    nx.write_gexf(G, edge_filepath.append('{}'.format(title)))
    

# Text Processing Pipeline (Too Slow)
from https://www.kaggle.com/balatmak/text-preprocessing-steps-and-universal-pipeline#Reusable-pipeline

In [6]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise

nlp = en_core_web_sm.load()


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1, custom_stop_words=[]):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization
        
        variety - format of date (AmE - american type, BrE - british format) 
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run
        """
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs
        self.custom_stop_words=custom_stop_words

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except:
            return text

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if t.is_stop != True and t.text not in self.custom_stop_words  and 'https' not in t.text]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])



# Democratic Convention Data


In [8]:
dconvention_df = pd.read_csv('large_data/dconvention-tweet-ids.csv')
dconvention_df.describe()
dconvention_df.columns

Unnamed: 0,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,reweet_id,retweet_screen_name,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_statuses_count,user_time_zone
count,134687.0,134687.0,7540.0,17244.0,134687.0,0.0,0.0,134687.0,134687.0,134687.0,134687.0,134687.0,0.0
mean,3.345653,7.582166e+17,7.576638e+17,1.624101e+16,1.681061,,,32161.83,9140.628,2767.032609,133.059946,47978.42,
std,93.861568,522653900000000.0,1.226524e+16,1.07366e+17,60.445901,,,66001.0,91326.16,7243.250845,484.443211,92825.56,
min,0.0,7.564538e+17,4.238422e+16,12.0,0.0,,,0.0,0.0,0.0,0.0,1.0,
25%,0.0,7.577726e+17,7.576961e+17,16511950.0,0.0,,,2686.0,392.0,520.0,11.0,6261.0,
50%,0.0,7.581443e+17,7.58065e+17,55245920.0,0.0,,,10194.0,1110.0,1166.0,34.0,17799.0,
75%,1.0,7.586317e+17,7.584846e+17,526689900.0,0.0,,,31723.0,3352.0,2645.0,97.0,49175.0,
max,30492.0,7.594183e+17,7.594173e+17,7.591036e+17,20768.0,,,1053126.0,13724330.0,317437.0,30229.0,2230300.0,


Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

In [36]:
#create a node df with summary columns for each user
dnode_features = [
       'favorite_count', 'place',
       'retweet_count',
       'text', 
       'user_screen_name', 'user_description',
       'user_followers_count',
       'user_location'
       ]
dnode_df = dconvention_df[dnode_features].copy()
#fill in 'place' column if isNaN, using the location of the User.
nan_map = dnode_df['place'].isna()
dnode_df['place'][nan_map] = dnode_df['user_location'][nan_map]
#group by user_id and sum various metrics for the user
dnode_df = dnode_df.groupby(['user_screen_name', 'place', 'user_description']).sum()
dnode_df = dnode_df.reset_index().set_index('user_screen_name')
dnode_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0_level_0,place,user_description,favorite_count,retweet_count,user_followers_count
user_screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0000livia,"Suffolk, VA",Jesus Lover|Wife of @qfranklin23 | Mother | He...,1,0,546
01RedZ28,"Park Ridge, NJ","New York Giants, New York Rangers, New York Ya...",0,0,114
033sdn,"Boardman, OH","it was the age of wisdom, it was the age of fo...",0,0,1582
08meghana,"Palatine, IL",Mumbaite living in USA🌎,6,1,4030
0SamanthaRae0,"Port St John, FL","Im a mother, a sister and a daughter!♡",1,0,65
...,...,...,...,...,...
zwoebe,"Groningen, Nederland",Drinkt Krombacher met koerspetje op z'n kop. D...,1,0,711
zx10arlo,"Omaha, NE",Founder of Mav Motors at UNO. Car and Bike En...,3,0,265
zxcvanessa,"Oakley, CA","always forward, never back",0,0,394
zyo5,"Hoboken, NJ",tweetin' machine.,0,0,236


In [63]:
#join the node data onto the mention nodes csv
dmentions = pd.read_csv('data/democrat/convention_mentions/dmention-nodes.csv', index_col='Label')
dmentions
dmentions.merge(dnode_df, left_index=True, right_index=True, how='left')
dmentions.nunique

Unnamed: 0_level_0,Id
Label,Unnamed: 1_level_1
weeklystandard,17546958
realDonaldTrump,25073877
HillaryClinton,1339835893
Natire2u,255645890
AnitaDWhitee,909448512
...,...
LatinoCommFdn,224810490
cspanMatthew,576520016
ShellsBells143,616692598
RevJacquiLewis,587591389


Unnamed: 0,Id,place,user_description,favorite_count,retweet_count,user_followers_count
01elibo,2275927727,,,,,
09072021,388150165,,,,,
0Sundance,425116151,,,,,
1,18899974,,,,,
100DaysOfHRC,758675398913765376,,,,,
...,...,...,...,...,...,...
zstollar,620473700,,,,,
ztsamudzi,2968635633,,,,,
zuma02,221642866,,,,,
zxnaida,1070209219,,,,,


<bound method DataFrame.nunique of                          Id
Label                      
weeklystandard     17546958
realDonaldTrump    25073877
HillaryClinton   1339835893
Natire2u          255645890
AnitaDWhitee      909448512
...                     ...
LatinoCommFdn     224810490
cspanMatthew      576520016
ShellsBells143    616692598
RevJacquiLewis    587591389
RevKHenderson    1289348384

[15737 rows x 1 columns]>

# Republican Convention Data 

In [9]:
rconvention_df = pd.read_csv('large_data/rconvention-tweet-ids.csv')
rconvention_df.describe()
rconvention_df.columns

Unnamed: 0,favorite_count,id,in_reply_to_status_id,in_reply_to_user_id,retweet_count,reweet_id,retweet_screen_name,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_statuses_count,user_time_zone
count,113631.0,113631.0,5650.0,12410.0,113631.0,0.0,0.0,113631.0,113631.0,113631.0,113631.0,113631.0,0.0
mean,3.855304,7.556725e+17,7.54974e+17,1.055017e+16,1.846283,,,32644.99,13164.84,2666.006187,150.543109,47951.56,
std,117.44966,470777800000000.0,1.442378e+16,8.699262e+16,52.296739,,,66404.11,481713.2,7370.316474,901.512807,85281.17,
min,0.0,7.539204e+17,2.325725e+17,12.0,0.0,,,0.0,0.0,0.0,0.0,2.0,
25%,0.0,7.552497e+17,7.552227e+17,18916430.0,0.0,,,2916.5,395.0,545.0,11.0,7076.0,
50%,0.0,7.555901e+17,7.555728e+17,40069010.0,0.0,,,10365.0,1162.0,1188.0,35.0,19659.0,
75%,1.0,7.560576e+17,7.559575e+17,254190000.0,0.0,,,32644.5,3377.0,2717.0,104.0,53377.0,
max,26220.0,7.564522e+17,7.564517e+17,7.562792e+17,11502.0,,,1053127.0,79679070.0,508859.0,116472.0,1611752.0,


Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

# Democratic Candidates Data

# Republican Candidates Data

In [64]:
import preprocessor as p
p.clean('preprocessing and it’s always better to be sure that our data is cleaned 100%.')

'preprocessing and it’s always better to be sure that our data is cleaned %.'