In [1]:
# Checking to see if GPU drive is active
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Apr 28 12:21:30 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Checking to see if GPU drive is active
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [1]:
# If running in google colab:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install -U spacy

In [2]:
# !python -m spacy download en_core_web_sm

# Download en_core_web_lg for google colab
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.1MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180944 sha256=ef514488426ad7f0e40bef2687aadf4c9e07f5a7cd690cb908edf855bbd31f35
  Stored in directory: /tmp/pip-ephem-wheel-cache-khdvwsj2/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import regex as re
# import unicodedata

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, pairwise_distances

import spacy
# import en_core_web_sm
import en_core_web_lg

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Loading the small (sm) spaCy English model, pre-trained on web text data
# nlp = en_core_web_sm.load()

# Loading the large (sm) spaCy English model, pre-trained on web text data
# In order to use vectorization, you must use the large pipeline package
# https://spacy.io/usage/spacy-101
'''
"To make them compact and fast, spaCy’s small pipeline packages (all packages that end in sm) 
don’t ship with word vectors, and only include context-sensitive tensors... 
So in order to use real word vectors, you need to download a larger pipeline package."
'''
nlp = en_core_web_lg.load()

# max length must be increased due to large size of each document
nlp.max_length = 10000000 # or higher


In [28]:
# # Importing the finalized concatinated dataframe
# # Use this line if running locally
# data = pd.read_csv('../data/concat_df.csv')

# If running in google colab, use this import line:
# Change to file location if necessary
data = pd.read_csv('drive/MyDrive/Python/concat_df.csv')

data.shape

(524, 8)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   screen_name  524 non-null    object
 1   state        524 non-null    object
 2   position     524 non-null    object
 3   name         524 non-null    object
 4   party        524 non-null    object
 5   tweet_count  524 non-null    int64 
 6   text_concat  524 non-null    object
 7   text_length  524 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 32.9+ KB


In [20]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight: “The president had staged an elaborat...,1623390
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476
2,justinamash,MI,Representative,Justin Amash,L,3414,"Rioting and looting—that’s selfish, destructiv...",811799


In [21]:
data.isnull().sum()

screen_name    0
state          0
position       0
name           0
party          0
tweet_count    0
text_concat    0
text_length    0
dtype: int64

In [29]:
# making sure there are no spaces at start or end of cells
data['screen_name'] = [i.strip() for i in data['screen_name']]
data['state'] = [i.strip() for i in data['state']]
data['position'] = [i.strip() for i in data['position']]
data['name'] = [i.strip() for i in data['name']]
data['party'] = [i.strip() for i in data['party']]
data['text_concat'] = [i.strip() for i in data['text_concat']]

data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight: “The president had staged an elaborat...,1623390
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476
2,justinamash,MI,Representative,Justin Amash,L,3414,"Rioting and looting—that’s selfish, destructiv...",811799


In [30]:
# Removing hyperlinks from tweet text
# HELP: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
data['text_concat'] = [re.sub(r'http\S+', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of hyperlinks
data['text_length_1'] = data['text_concat'].apply(len)

In [32]:
# Removing twitter handles from tweet text
# HELP: https://stackoverflow.com/questions/54733828/remove-twitter-mentions-from-pandas-column
data['text_concat'] = [re.sub(r'@[A-Za-z0-9]+', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of hyperlinks
data['text_length_2'] = data['text_concat'].apply(len)

In [33]:
# Removing all emojis
# HELP:  https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1#file-strip_emoji-py

remove_emojis = re.compile(
    "(["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "])"
  )

data['text_concat'] = [re.sub(remove_emojis, '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of emojis
data['text_length_3'] = data['text_concat'].apply(len)

In [34]:
# Remove punctuation, special characters, and numbers
data['text_concat'] = [re.sub(r'[^a-zA-Z0-9 ]+', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of any other special characters, punctuation, numbers
data['text_length_4'] = data['text_concat'].apply(len)

In [35]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,text_length_4
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1085547,1030695,1030438,996854
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,962886,902590,901653,864569
2,justinamash,MI,Representative,Justin Amash,L,3414,Rioting and lootingthats selfish destructive c...,811799,683172,639815,639651,616663


In [36]:
data['text_concat'][0]



In [16]:
# Count the number of words for each user
# HELP: https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe

data['word_length'] = data['text_concat'].str.split(' ').str.len()

In [17]:
data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1623133,1623133,1498873,178283
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,1312539,1312539,1214291,154634
2,justinamash,MI,Representative,Justin Amash,L,3414,Rioting and lootingthats selfish destructive c...,811799,811635,811635,764722,108476
3,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,Its important that every American have the cha...,2076035,2075558,2075558,1928216,264882
4,NormaJTorres,CA,Representative,Norma Torres,D,3198,jolingkent take care thank you for continuing ...,1154739,1154021,1154021,1066304,132885


In [18]:
data['tweet_count'].describe()

count      524.000000
mean      2127.862595
std       1678.358685
min         10.000000
25%       1064.000000
50%       1735.000000
75%       2774.250000
max      13257.000000
Name: tweet_count, dtype: float64

In [19]:
data['word_length'].describe()

count       524.000000
mean      80336.150763
std       65900.758106
min         341.000000
25%       38379.750000
50%       65114.000000
75%      103038.500000
max      572737.000000
Name: word_length, dtype: float64

In [20]:
data['text_length_3'].describe()

count    5.240000e+02
mean     6.334546e+05
std      5.235732e+05
min      2.205000e+03
25%      3.049430e+05
50%      5.145025e+05
75%      8.133498e+05
max      4.446964e+06
Name: text_length_3, dtype: float64

In [21]:
# See the list of spaCy stopwords
stopwords = list(STOP_WORDS)

print(f'SpaCy stop word list length: {len(stopwords)}')
print('--------------------------------')
print(f'SpaCy stop word list: {stopwords}')

SpaCy stop word list length: 326
--------------------------------
SpaCy stop word list: ['latterly', 'was', 'thereupon', 'toward', 'however', 'too', 'again', 'elsewhere', 'several', 'whatever', 'onto', 'wherein', '’ll', 'everyone', 'while', '’d', 'why', 'here', 'there', 'former', 'through', 'say', 'as', 'not', 'forty', 'below', 'those', "'m", 'sometimes', '‘d', 'do', 'n’t', 'these', 'none', 'along', 'any', 'part', 'front', 'or', 'seem', 'might', 'across', 'either', 'unless', 'hundred', 'before', 'but', 'two', 'ours', 'did', 'beforehand', 'full', '‘re', 'everywhere', 'seems', 'ever', 'wherever', 'regarding', 'without', 'whose', 'after', 'are', 'than', 'nothing', 'four', 'own', 'anything', 're', 'seeming', 'some', 'him', 'see', 'on', 'by', 'show', 'his', 'your', 'above', 'namely', 'thence', 'may', 'other', 'itself', 'herein', 'them', 'an', 'nevertheless', 'thru', 'beyond', 'us', '’ve', 'whence', '’s', 'anyhow', 'almost', 'yourself', 'whereby', 'nobody', 'make', 'top', 'upon', 'hers', 'ag

In [22]:
# contractions in the list of spaCy stop words
# HELP understanding .isalpha(): https://www.geeksforgeeks.org/python-string-isalpha-application/
contraction_parts = [word for word in stopwords if not word.isalpha()]

print(len(contraction_parts))
print('--------------------------------')
print(contraction_parts)

21
--------------------------------
['’ll', '’d', "'m", '‘d', 'n’t', '‘re', '’ve', '’s', '‘s', '’re', "'ll", '‘ll', '’m', "n't", '‘ve', "'ve", "'re", "'s", "'d", '‘m', 'n‘t']


In [23]:
# Remove contractions to use w/ CountVectorizer
stopwords_full_words = [word for word in stopwords if word.isalpha()]

print(len(stopwords_full_words))
print('--------------------------------')
print(stopwords_full_words)

305
--------------------------------
['latterly', 'was', 'thereupon', 'toward', 'however', 'too', 'again', 'elsewhere', 'several', 'whatever', 'onto', 'wherein', 'everyone', 'while', 'why', 'here', 'there', 'former', 'through', 'say', 'as', 'not', 'forty', 'below', 'those', 'sometimes', 'do', 'these', 'none', 'along', 'any', 'part', 'front', 'or', 'seem', 'might', 'across', 'either', 'unless', 'hundred', 'before', 'but', 'two', 'ours', 'did', 'beforehand', 'full', 'everywhere', 'seems', 'ever', 'wherever', 'regarding', 'without', 'whose', 'after', 'are', 'than', 'nothing', 'four', 'own', 'anything', 're', 'seeming', 'some', 'him', 'see', 'on', 'by', 'show', 'his', 'your', 'above', 'namely', 'thence', 'may', 'other', 'itself', 'herein', 'them', 'an', 'nevertheless', 'thru', 'beyond', 'us', 'whence', 'anyhow', 'almost', 'yourself', 'whereby', 'nobody', 'make', 'top', 'upon', 'hers', 'against', 'into', 'is', 'anywhere', 'yours', 'amongst', 'among', 'down', 'behind', 'he', 'three', 'anyway

In [24]:
data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1623133,1623133,1498873,178283
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,1312539,1312539,1214291,154634
2,justinamash,MI,Representative,Justin Amash,L,3414,Rioting and lootingthats selfish destructive c...,811799,811635,811635,764722,108476
3,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,Its important that every American have the cha...,2076035,2075558,2075558,1928216,264882
4,NormaJTorres,CA,Representative,Norma Torres,D,3198,jolingkent take care thank you for continuing ...,1154739,1154021,1154021,1066304,132885


In [25]:
# Find the 50 most common words in my corpus 
X = data['text_concat']

cvec = CountVectorizer(max_features=50, stop_words=stopwords_full_words)

cvec.fit(X);

In [26]:
print(cvec.get_feature_names())

['act', 'american', 'americans', 'amp', 'bill', 'bipartisan', 'care', 'communities', 'community', 'congress', 'continue', 'coronavirus', 'country', 'covid19', 'crisis', 'day', 'families', 'federal', 'great', 'health', 'help', 'house', 'im', 'like', 'national', 'need', 'new', 'people', 'president', 'protect', 'proud', 'public', 'qt', 'realdonaldtrump', 'right', 'rt', 'senate', 'state', 'support', 'thank', 'time', 'today', 'trump', 'vote', 'week', 'women', 'work', 'workers', 'working', 'years']


In [27]:
# should I remove @tweet_handles?

In [28]:
# Adding some common words from above code
custom_stopwords = ['rt', 'qt', 'im', 'amp']
for word in custom_stopwords:
    stopwords.append(word)

# Verify function worked & words were appended
print(stopwords[-len(custom_stopwords):])
print('-----------')
print(stopwords)

['rt', 'qt', 'im']
-----------
['latterly', 'was', 'thereupon', 'toward', 'however', 'too', 'again', 'elsewhere', 'several', 'whatever', 'onto', 'wherein', '’ll', 'everyone', 'while', '’d', 'why', 'here', 'there', 'former', 'through', 'say', 'as', 'not', 'forty', 'below', 'those', "'m", 'sometimes', '‘d', 'do', 'n’t', 'these', 'none', 'along', 'any', 'part', 'front', 'or', 'seem', 'might', 'across', 'either', 'unless', 'hundred', 'before', 'but', 'two', 'ours', 'did', 'beforehand', 'full', '‘re', 'everywhere', 'seems', 'ever', 'wherever', 'regarding', 'without', 'whose', 'after', 'are', 'than', 'nothing', 'four', 'own', 'anything', 're', 'seeming', 'some', 'him', 'see', 'on', 'by', 'show', 'his', 'your', 'above', 'namely', 'thence', 'may', 'other', 'itself', 'herein', 'them', 'an', 'nevertheless', 'thru', 'beyond', 'us', '’ve', 'whence', '’s', 'anyhow', 'almost', 'yourself', 'whereby', 'nobody', 'make', 'top', 'upon', 'hers', 'against', 'into', 'is', 'anywhere', 'yours', 'amongst', '‘s

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   screen_name    524 non-null    object
 1   state          524 non-null    object
 2   position       524 non-null    object
 3   name           524 non-null    object
 4   party          524 non-null    object
 5   tweet_count    524 non-null    int64 
 6   text_concat    524 non-null    object
 7   text_length    524 non-null    int64 
 8   text_length_1  524 non-null    int64 
 9   text_length_2  524 non-null    int64 
 10  text_length_3  524 non-null    int64 
 11  word_length    524 non-null    int64 
dtypes: int64(6), object(6)
memory usage: 49.2+ KB


In [30]:
# Function to to remove custom stop words list 
def remove_stopwords(doc):
    no_stopwords_doc = []
    for token in doc:
        if token.text.lower() not in stopwords:
            no_stopwords_doc.append(token)
    return no_stopwords_doc

In [31]:
# Function to only keep nouns, verbs, adjectives, adverbs
def keep_nous_verbs_adj_adv(token_list):
    keep_nvaa = [token for token in token_list if token.pos_ in ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']]
    return keep_nvaa

In [32]:
# Function to lemmatize the tokens into a list of strings
def lemmatizer(token_list):
    lem_tokens = [token.lemma_ for token in token_list]
    return lem_tokens

In [33]:
# Function to return a list of Tokens
def process_text(text):
    text = nlp(text)
    token_list = remove_stopwords(text)
    token_list = keep_nous_verbs_adj_adv(token_list) 
    final_string = lemmatizer(token_list)
    return final_string

In [34]:
# Define a function to turn a list of tokens into a spaCy Doc
def make_spacy_doc(final_string):
    spacy_doc = nlp(' '.join(final_string))
    return spacy_doc

In [35]:
# Create a new column of processed spaCy docs
# call process text function to invoke all functions
data['text_concat_clean'] = [process_text(text_string) for text_string in data['text_concat']]

data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1623133,1623133,1498873,178283,"[tonight, president, stage, elaborate, photo, ..."
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,1312539,1312539,1214291,154634,"[hell, Donald, Trump, think, teargasse, peacef..."
2,justinamash,MI,Representative,Justin Amash,L,3414,Rioting and lootingthats selfish destructive c...,811799,811635,811635,764722,108476,"[rioting, lootingthat, selfish, destructive, c..."
3,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,Its important that every American have the cha...,2076035,2075558,2075558,1928216,264882,"[important, American, chance, hear, network, n..."
4,NormaJTorres,CA,Representative,Norma Torres,D,3198,jolingkent take care thank you for continuing ...,1154739,1154021,1154021,1066304,132885,"[jolingkent, care, thank, continue, report, fi..."


In [36]:
# saving cleaned file to google drive
data.to_csv('drive/MyDrive/Python/concat_df_clean_lg.csv', index=False)

In [37]:
data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1623133,1623133,1498873,178283,"[tonight, president, stage, elaborate, photo, ..."
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,1312539,1312539,1214291,154634,"[hell, Donald, Trump, think, teargasse, peacef..."
2,justinamash,MI,Representative,Justin Amash,L,3414,Rioting and lootingthats selfish destructive c...,811799,811635,811635,764722,108476,"[rioting, lootingthat, selfish, destructive, c..."
3,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,Its important that every American have the cha...,2076035,2075558,2075558,1928216,264882,"[important, American, chance, hear, network, n..."
4,NormaJTorres,CA,Representative,Norma Torres,D,3198,jolingkent take care thank you for continuing ...,1154739,1154021,1154021,1066304,132885,"[jolingkent, care, thank, continue, report, fi..."


(524, 12)

In [7]:
# read in saved data in order to vectorize it
data = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg.csv')

In [9]:
data.head()

Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean
0,0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1623133,1623133,1498873,178283,"['tonight', 'president', 'stage', 'elaborate',..."
1,1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,1312539,1312539,1214291,154634,"['hell', 'Donald', 'Trump', 'think', 'teargass..."
2,2,justinamash,MI,Representative,Justin Amash,L,3414,Rioting and lootingthats selfish destructive c...,811799,811635,811635,764722,108476,"['rioting', 'lootingthat', 'selfish', 'destruc..."
3,3,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,Its important that every American have the cha...,2076035,2075558,2075558,1928216,264882,"['important', 'American', 'chance', 'hear', 'n..."
4,4,NormaJTorres,CA,Representative,Norma Torres,D,3198,jolingkent take care thank you for continuing ...,1154739,1154021,1154021,1066304,132885,"['jolingkent', 'care', 'thank', 'continue', 'r..."


In [8]:
data.shape

(524, 14)

In [13]:
131+131+131+131

524

In [14]:
# Complete dataset causes kernel to crash.
# overall dataset has been subdivided into 4 smaller dataframes for processing

# data.iloc[0:4,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_0.csv', index=False)
data.iloc[0:131,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_1.csv', index=False)
data.iloc[132:262,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_2.csv', index=False)
data.iloc[263:393,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_3.csv', index=False)
data.iloc[394:524,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_4.csv', index=False)

In [15]:
# data0 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_0.csv')
data1 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_1.csv')
data2 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_2.csv')
data3 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_3.csv')
data4 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_4.csv')

In [16]:
# Creating a string from the 'text_concat_clean' list column
# list format will not work with spaCy
# must be in list format for vectorization

# data0['liststring'] = data0['text_concat_clean'].apply(lambda x: x[1:-1])
# data0['liststring'] = data0['liststring'].str.replace(',','')

data1['liststring'] = data1['text_concat_clean'].apply(lambda x: x[1:-1])
data1['liststring'] = data1['liststring'].str.replace(',','')

data2['liststring'] = data2['text_concat_clean'].apply(lambda x: x[1:-1])
data2['liststring'] = data2['liststring'].str.replace(',','')

data3['liststring'] = data3['text_concat_clean'].apply(lambda x: x[1:-1])
data3['liststring'] = data3['liststring'].str.replace(',','')

data4['liststring'] = data4['text_concat_clean'].apply(lambda x: x[1:-1])
data4['liststring'] = data4['liststring'].str.replace(',','')



# data1['liststring'] = [[' '.join(i)] for i in data1['text_concat_clean']]


# data1["liststring"]= data1["text_concat_clean"].str.join("")
# data1['liststring'] = " ".join(data1['text_concat_clean'])

# data0['liststring'] = [' '.join(map(str, l)) for l in data0['text_concat_clean']]

# data1['liststring'] = [' '.join(map(str, l)) for l in data1['text_concat_clean']]
# data2['liststring'] = [' '.join(map(str, l)) for l in data2['text_concat_clean']]
# data3['liststring'] = [' '.join(map(str, l)) for l in data3['text_concat_clean']]
# data4['liststring'] = [' '.join(map(str, l)) for l in data4['text_concat_clean']]

# data0.head(2)

In [None]:
# # Create a new column of each spaCy doc's vector

# data0['vectors'] = [nlp(doc).vector for doc in data0['liststring']]

# print(type(data0['vectors'][0]))

# data0.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653,"['great', 'welcome', 'SenSanders', 'CA44', 'ra...",'great' 'welcome' 'SenSanders' 'CA44' 'rally' ...,"[-0.1952472637594289, 0.39182178540036694, 0.0..."
1,1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609,"['new', 'apartment', 'new', 'business', 'new',...",'new' 'apartment' 'new' 'business' 'new' 'rest...,"[-0.207492859274217, 0.3944523199382951, 0.048..."


In [None]:
# # saving cleaned file to google drive
# # the file MUST be saved as .json due to .csv files dropping vector column formatting
# data0.to_json('drive/MyDrive/Python/tweet_vectors_0.json')

In [17]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 1
data1['vectors'] = [nlp(doc).vector for doc in data1['liststring']]

print(type(data1['vectors'][0]))

data1.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight The president had staged an elaborate ...,1623390,1623133,1623133,1498873,178283,"['tonight', 'president', 'stage', 'elaborate',...",'tonight' 'president' 'stage' 'elaborate' 'pho...,"[-0.20780781231007517, 0.3950706796592127, 0.0..."
1,1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476,1312539,1312539,1214291,154634,"['hell', 'Donald', 'Trump', 'think', 'teargass...",'hell' 'Donald' 'Trump' 'think' 'teargasse' 'p...,"[-0.20511710777676892, 0.3915297958984846, 0.0..."


In [18]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data1.to_json('drive/MyDrive/Python/tweet_vectors_1.json')

In [19]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 2
data2['vectors'] = [nlp(doc).vector for doc in data2['liststring']]

print(type(data2['vectors'][0]))

data2.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,132,SenDuckworth,IL,Senator,Tammy Duckworth,D,4140,Our military is one of the most diverse instit...,1425319,1425032,1425032,1330677,173828,"['military', 'diverse', 'institution', 'Americ...",'military' 'diverse' 'institution' 'America' '...,"[-0.20541483869458502, 0.3955614524775245, 0.0..."
1,133,RonWyden,OR,Senator,Ron Wyden,D,3056,Black lives matter The Fourth Amendment protec...,1091437,1091204,1091204,1020388,130845,"['black', 'life', 'matter', 'Fourth', 'Amendme...",'black' 'life' 'matter' 'Fourth' 'Amendment' '...,"[-0.2127892177515624, 0.3945155785525361, 0.05..."


In [20]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data2.to_json('drive/MyDrive/Python/tweet_vectors_2.json')

In [21]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 3
data3['vectors'] = [nlp(doc).vector for doc in data3['liststring']]

print(type(data3['vectors'][0]))

data3.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,263,RepDonaldPayne,NJ,Representative,"Donald Payne, Jr",D,1281,Since George Floyds murder Ive spoken with Con...,363694,363660,363660,338850,47314,"['George', 'floyds', 'murder', 'have', 'speak'...",'George' 'floyds' 'murder' 'have' 'speak' 'con...,"[-0.1994762665149686, 0.3998312948905332, 0.04..."
1,264,RepTipton,CO,Representative,Scott Tipton,R,832,RT SecPompeo It starts so soon For the first t...,256154,255966,255966,236396,28387,"['SecPompeo', 'start', 'soon', 'time', 'year',...",'SecPompeo' 'start' 'soon' 'time' 'year' 'Hong...,"[-0.19678552, 0.39389646, 0.046309564, -0.0980..."


In [22]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data3.to_json('drive/MyDrive/Python/tweet_vectors_3.json')

In [23]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 4
data4['vectors'] = [nlp(doc).vector for doc in data4['liststring']]

print(type(data4['vectors'][0]))

data4.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,394,RepEBJ,TX,Representative,Eddie Bernice,D,644,Lets be clear there is a distinct difference ...,189486,189453,189453,176304,22576,"['let', 'clear', 'distinct', 'difference', 'pr...",'let' 'clear' 'distinct' 'difference' 'protest...,"[-0.19536115, 0.39665082, 0.05266507, -0.08611..."
1,395,RepKenBuck,CO,Representative,Ken Buck,R,1291,RT RepAndyBiggsAZ In this context the question...,388472,388238,388238,357644,43495,"['RepAndyBiggsAZ', 'context', 'question', 'CDC...",'RepAndyBiggsAZ' 'context' 'question' 'CDC' 'o...,"[-0.2074937754667653, 0.39040758661797814, 0.0..."


In [24]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data4.to_json('drive/MyDrive/Python/tweet_vectors_4.json')