In [17]:
# !pip install -U spacy

In [18]:
# !python -m spacy download en_core_web_sm

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



import regex as re
import unicodedata

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, pairwise_distances

import spacy
import en_core_web_sm
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS




pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [20]:
# Importing the finalized concatinated dataframe
data = pd.read_csv('../data/concat_df.csv')

data.shape

(626, 8)

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626 entries, 0 to 625
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   screen_name  626 non-null    object
 1   state        626 non-null    object
 2   position     626 non-null    object
 3   name         626 non-null    object
 4   party        626 non-null    object
 5   tweet_count  626 non-null    int64 
 6   text_concat  626 non-null    object
 7   text_length  626 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 39.2+ KB


In [22]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"New apartments, new businesses and new restaur...",2801192
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"Literally, the easiest fix we could make to sa...",1843540


In [23]:

data.isnull().sum()

screen_name    0
state          0
position       0
name           0
party          0
tweet_count    0
text_concat    0
text_length    0
dtype: int64

In [24]:
# making sure there are no spaces at start or end of cells
data['screen_name'] = [i.strip() for i in data['screen_name']]
data['state'] = [i.strip() for i in data['state']]
data['position'] = [i.strip() for i in data['position']]
data['name'] = [i.strip() for i in data['name']]
data['party'] = [i.strip() for i in data['party']]
data['text_concat'] = [i.strip() for i in data['text_concat']]

data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"New apartments, new businesses and new restaur...",2801192
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"Literally, the easiest fix we could make to sa...",1843540


In [25]:
# Removing all emojis
# HELP:  https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1#file-strip_emoji-py

remove_emojis = re.compile(
    "(["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "])"
  )

data['text_concat'] = [re.sub(remove_emojis, '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of emojis
data['text_length_1'] = data['text_concat'].apply(len)

In [26]:
# Removing hyperlinks from tweet text
# HELP: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
data['text_concat'] = [re.sub(r'^https?:\/\/.*[\r\n]*', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of hyperlinks
data['text_length_2'] = data['text_concat'].apply(len)

In [27]:
# Remove punctuation, special characters, and numbers
data['text_concat'] = [re.sub(r'[^a-zA-Z0-9 ]+', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of any other special characters, punctuation, numbers
data['text_length_3'] = data['text_concat'].apply(len)

In [28]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609
3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050,7085835,7085835,6576425
4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,NetNeutralityReminder Obama used 1934 FCC rule...,958737,958292,958292,886492


In [36]:
# Count the number of words for each user
# HELP: https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe

data['word_length'] = data['text_concat'].str.split(' ').str.len()

In [37]:
data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609,223536
3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050,7085835,7085835,6576425,839968
4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,NetNeutralityReminder Obama used 1934 FCC rule...,958737,958292,958292,886492,112053


In [44]:
data['tweet_count'].describe()

count      626.000000
mean      2384.752396
std       2141.344896
min          6.000000
25%        949.250000
50%       1890.000000
75%       3164.500000
max      18325.000000
Name: tweet_count, dtype: float64

In [43]:
data['word_length'].describe()

count       626.000000
mean      88128.678914
std       82571.864571
min         227.000000
25%       33350.000000
50%       68001.000000
75%      118017.250000
max      839968.000000
Name: word_length, dtype: float64

In [45]:
data['text_length_3'].describe()

count    6.260000e+02
mean     6.949572e+05
std      6.524310e+05
min      1.461000e+03
25%      2.630642e+05
50%      5.425705e+05
75%      9.337525e+05
max      6.576425e+06
Name: text_length_3, dtype: float64