In [3]:
# Checking to see if GPU drive is active
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [4]:
# Checking to see if GPU drive is active
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 38.0 gigabytes of available RAM

You are using a high-RAM runtime!


In [5]:
# If running in google colab:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install -U spacy

In [6]:
# !python -m spacy download en_core_web_sm

# Download en_core_web_lg for google colab
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import regex as re
# import unicodedata

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, pairwise_distances

import spacy
# import en_core_web_sm
import en_core_web_lg

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [8]:
# Loading the small (sm) spaCy English model, pre-trained on web text data
# nlp = en_core_web_sm.load()

# Loading the large (sm) spaCy English model, pre-trained on web text data
# In order to use vectorization, you must use the large pipeline package
# https://spacy.io/usage/spacy-101
'''
"To make them compact and fast, spaCy’s small pipeline packages (all packages that end in sm) 
don’t ship with word vectors, and only include context-sensitive tensors... 
So in order to use real word vectors, you need to download a larger pipeline package."
'''
nlp = en_core_web_lg.load()

# max length must be increased due to large size of each document
nlp.max_length = 10000000 # or higher


In [None]:
# # Importing the finalized concatinated dataframe
# # Use this line if running locally
# data = pd.read_csv('../data/concat_df.csv')

# If running in google colab, use this import line:
# Change to file location if necessary
data = pd.read_csv('drive/MyDrive/Python/concat_df.csv')

data.shape

(626, 8)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626 entries, 0 to 625
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   screen_name  626 non-null    object
 1   state        626 non-null    object
 2   position     626 non-null    object
 3   name         626 non-null    object
 4   party        626 non-null    object
 5   tweet_count  626 non-null    int64 
 6   text_concat  626 non-null    object
 7   text_length  626 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 39.2+ KB


In [None]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"New apartments, new businesses and new restaur...",2801192
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"Literally, the easiest fix we could make to sa...",1843540


In [None]:
data.isnull().sum()

screen_name    0
state          0
position       0
name           0
party          0
tweet_count    0
text_concat    0
text_length    0
dtype: int64

In [None]:
# making sure there are no spaces at start or end of cells
data['screen_name'] = [i.strip() for i in data['screen_name']]
data['state'] = [i.strip() for i in data['state']]
data['position'] = [i.strip() for i in data['position']]
data['name'] = [i.strip() for i in data['name']]
data['party'] = [i.strip() for i in data['party']]
data['text_concat'] = [i.strip() for i in data['text_concat']]

data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"New apartments, new businesses and new restaur...",2801192
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"Literally, the easiest fix we could make to sa...",1843540


In [None]:
# Removing all emojis
# HELP:  https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1#file-strip_emoji-py

remove_emojis = re.compile(
    "(["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "])"
  )

data['text_concat'] = [re.sub(remove_emojis, '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of emojis
data['text_length_1'] = data['text_concat'].apply(len)

In [None]:
# Removing hyperlinks from tweet text
# HELP: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
data['text_concat'] = [re.sub(r'^https?:\/\/.*[\r\n]*', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of hyperlinks
data['text_length_2'] = data['text_concat'].apply(len)

In [None]:
# Remove punctuation, special characters, and numbers
data['text_concat'] = [re.sub(r'[^a-zA-Z0-9 ]+', '', i) for i in data['text_concat']]

# creating a new column to see if length of text_concat has changed
# after the removal of any other special characters, punctuation, numbers
data['text_length_3'] = data['text_concat'].apply(len)

In [None]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609


In [None]:
# Count the number of words for each user
# HELP: https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe

data['word_length'] = data['text_concat'].str.split(' ').str.len()

In [None]:
data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609,223536
3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050,7085835,7085835,6576425,839968
4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,NetNeutralityReminder Obama used 1934 FCC rule...,958737,958292,958292,886492,112053


In [None]:
data['tweet_count'].describe()

count      626.000000
mean      2384.752396
std       2141.344896
min          6.000000
25%        949.250000
50%       1890.000000
75%       3164.500000
max      18325.000000
Name: tweet_count, dtype: float64

In [None]:
data['word_length'].describe()

count       626.000000
mean      88128.678914
std       82571.864571
min         227.000000
25%       33350.000000
50%       68001.000000
75%      118017.250000
max      839968.000000
Name: word_length, dtype: float64

In [None]:
data['text_length_3'].describe()

count    6.260000e+02
mean     6.949572e+05
std      6.524310e+05
min      1.461000e+03
25%      2.630642e+05
50%      5.425705e+05
75%      9.337525e+05
max      6.576425e+06
Name: text_length_3, dtype: float64

In [None]:
# See the list of spaCy stopwords
stopwords = list(STOP_WORDS)

print(f'SpaCy stop word list length: {len(stopwords)}')
print('--------------------------------')
print(f'SpaCy stop word list: {stopwords}')

SpaCy stop word list length: 326
--------------------------------
SpaCy stop word list: ['thru', 'several', 'yourself', 'less', '‘ll', 'whole', 'six', 'one', 'an', 'please', 'nobody', 'if', 'does', 'then', 'seems', 'with', 'at', 'would', 'between', 'see', 'thereupon', 'although', 'before', 'give', 'bottom', 'mostly', '’s', 'forty', 'into', 'throughout', 'back', 'few', 'yet', 'to', 'per', 'as', 'something', 'himself', 'much', '’m', 'down', 'doing', 'within', 'my', 'front', 'which', 'else', 'over', "'ll", 'anyhow', 'various', 'has', 'how', 'yours', 'others', 'unless', 'out', 'sixty', 'across', 'there', 'hereafter', 'noone', 'full', 'therein', 'whereby', 'whoever', 'ten', 'onto', '‘m', 'nine', 'a', 'cannot', 'whose', 'still', 'any', 'their', 'afterwards', 'yourselves', 'make', 'together', 'elsewhere', 'even', 'ever', '’re', 'these', 'mine', 'somehow', 'about', 'meanwhile', 'however', 'such', 'becomes', 'because', 'she', 'amount', 'hereby', 'above', 'you', 'is', 'on', 'thereby', 'many', 's

In [None]:
# contractions in the list of spaCy stop words
# HELP understanding .isalpha(): https://www.geeksforgeeks.org/python-string-isalpha-application/
contraction_parts = [word for word in stopwords if not word.isalpha()]

print(len(contraction_parts))
print('--------------------------------')
print(contraction_parts)

21
--------------------------------
['‘ll', '’s', '’m', "'ll", '‘m', '’re', '’ve', "'m", '‘s', "'d", "'ve", "'s", '‘ve', "'re", '’ll', 'n‘t', '’d', '‘re', 'n’t', '‘d', "n't"]


In [None]:
# Remove contractions to use w/ CountVectorizer
stopwords_full_words = [word for word in stopwords if word.isalpha()]

print(len(stopwords_full_words))
print('--------------------------------')
print(stopwords_full_words)

305
--------------------------------
['thru', 'several', 'yourself', 'less', 'whole', 'six', 'one', 'an', 'please', 'nobody', 'if', 'does', 'then', 'seems', 'with', 'at', 'would', 'between', 'see', 'thereupon', 'although', 'before', 'give', 'bottom', 'mostly', 'forty', 'into', 'throughout', 'back', 'few', 'yet', 'to', 'per', 'as', 'something', 'himself', 'much', 'down', 'doing', 'within', 'my', 'front', 'which', 'else', 'over', 'anyhow', 'various', 'has', 'how', 'yours', 'others', 'unless', 'out', 'sixty', 'across', 'there', 'hereafter', 'noone', 'full', 'therein', 'whereby', 'whoever', 'ten', 'onto', 'nine', 'a', 'cannot', 'whose', 'still', 'any', 'their', 'afterwards', 'yourselves', 'make', 'together', 'elsewhere', 'even', 'ever', 'these', 'mine', 'somehow', 'about', 'meanwhile', 'however', 'such', 'becomes', 'because', 'she', 'amount', 'hereby', 'above', 'you', 'is', 'on', 'thereby', 'many', 'same', 'that', 'myself', 'next', 'nowhere', 'done', 'go', 'three', 'move', 'whether', 'here

In [None]:
data.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609,223536
3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050,7085835,7085835,6576425,839968
4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,NetNeutralityReminder Obama used 1934 FCC rule...,958737,958292,958292,886492,112053


In [None]:
# Find the 50 most common words in my corpus 
X = data['text_concat']

cvec = CountVectorizer(max_features=50, stop_words=stopwords_full_words)

cvec.fit(X);

In [None]:
print(cvec.get_feature_names())

['act', 'american', 'americans', 'amp', 'bill', 'bipartisan', 'care', 'communities', 'community', 'congress', 'continue', 'country', 'covid19', 'day', 'families', 'federal', 'great', 'health', 'help', 'house', 'im', 'important', 'like', 'national', 'need', 'new', 'people', 'president', 'protect', 'proud', 'public', 'qt', 'realdonaldtrump', 'right', 'rt', 'senate', 'state', 'support', 'thank', 'time', 'today', 'trump', 'vote', 'week', 'women', 'work', 'workers', 'working', 'year', 'years']


In [None]:
# should I remove @tweet_handles?

In [None]:
# Adding some common words from above code
custom_stopwords = ['rt', 'qt', 'im']
for word in custom_stopwords:
    stopwords.append(word)

# Verify function worked & words were appended
print(stopwords[-len(custom_stopwords):])
print('-----------')
print(stopwords)

['rt', 'qt', 'im']
-----------
['thru', 'several', 'yourself', 'less', '‘ll', 'whole', 'six', 'one', 'an', 'please', 'nobody', 'if', 'does', 'then', 'seems', 'with', 'at', 'would', 'between', 'see', 'thereupon', 'although', 'before', 'give', 'bottom', 'mostly', '’s', 'forty', 'into', 'throughout', 'back', 'few', 'yet', 'to', 'per', 'as', 'something', 'himself', 'much', '’m', 'down', 'doing', 'within', 'my', 'front', 'which', 'else', 'over', "'ll", 'anyhow', 'various', 'has', 'how', 'yours', 'others', 'unless', 'out', 'sixty', 'across', 'there', 'hereafter', 'noone', 'full', 'therein', 'whereby', 'whoever', 'ten', 'onto', '‘m', 'nine', 'a', 'cannot', 'whose', 'still', 'any', 'their', 'afterwards', 'yourselves', 'make', 'together', 'elsewhere', 'even', 'ever', '’re', 'these', 'mine', 'somehow', 'about', 'meanwhile', 'however', 'such', 'becomes', 'because', 'she', 'amount', 'hereby', 'above', 'you', 'is', 'on', 'thereby', 'many', 'same', 'that', 'myself', 'next', 'nowhere', 'done', 'go', 

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626 entries, 0 to 625
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   screen_name    626 non-null    object
 1   state          626 non-null    object
 2   position       626 non-null    object
 3   name           626 non-null    object
 4   party          626 non-null    object
 5   tweet_count    626 non-null    int64 
 6   text_concat    626 non-null    object
 7   text_length    626 non-null    int64 
 8   text_length_1  626 non-null    int64 
 9   text_length_2  626 non-null    int64 
 10  text_length_3  626 non-null    int64 
 11  word_length    626 non-null    int64 
dtypes: int64(6), object(6)
memory usage: 58.8+ KB


In [None]:
# Function to to remove custom stop words list 
def remove_stopwords(doc):
    no_stopwords_doc = []
    for token in doc:
        if token.text.lower() not in stopwords:
            no_stopwords_doc.append(token)
    return no_stopwords_doc

In [None]:
# Function to only keep nouns, verbs, adjectives, adverbs
def keep_nous_verbs_adj_adv(token_list):
    keep_nvaa = [token for token in token_list if token.pos_ in ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']]
    return keep_nvaa

In [None]:
# Function to lemmatize the tokens into a list of strings
def lemmatizer(token_list):
    lem_tokens = [token.lemma_ for token in token_list]
    return lem_tokens

In [None]:
# Function to return a list of Tokens
def process_text(text):
    text = nlp(text)
    token_list = remove_stopwords(text)
    token_list = keep_nous_verbs_adj_adv(token_list) 
    final_string = lemmatizer(token_list)
    return final_string

In [None]:
# Define a function to turn a list of tokens into a spaCy Doc
def make_spacy_doc(final_string):
    spacy_doc = nlp(' '.join(final_string))
    return spacy_doc

In [None]:
# Create a new column of processed spaCy docs
# call process text function to invoke all functions
data['text_concat_clean'] = [process_text(text_string) for text_string in data['text_concat']]

data.head()

<class 'list'>


Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653,"[great, welcome, SenSanders, CA44, rally, stop..."
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609,"[new, apartment, new, business, new, restauran..."
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609,223536,"[literally, easy, fix, save, life, httpabsolut..."
3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050,7085835,7085835,6576425,839968,"[Baltimore, student, kill, gun, school, year, ..."
4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,NetNeutralityReminder Obama used 1934 FCC rule...,958737,958292,958292,886492,112053,"[netneutralityreminder, Obama, FCC, rule, appl..."


In [None]:
# saving cleaned file to google drive
data.to_csv('drive/MyDrive/Python/concat_df_clean_lg.csv')

In [50]:
# read in saved data in order to vectorize it
data = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg.csv')

In [51]:
# Complete dataset causes kernel to crash.
# overall dataset has been subdivided into 4 smaller dataframes for processing

# data.iloc[0:4,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_0.csv', index=False)
data.iloc[0:157,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_1.csv', index=False)
data.iloc[158:315,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_2.csv', index=False)
data.iloc[316:473,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_3.csv', index=False)
data.iloc[474:626,:].to_csv('drive/MyDrive/Python/concat_df_clean_lg_4.csv', index=False)

In [52]:
# data0 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_0.csv')
data1 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_1.csv')
data2 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_2.csv')
data3 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_3.csv')
data4 = pd.read_csv('drive/MyDrive/Python/concat_df_clean_lg_4.csv')

In [53]:
# Creating a string from the 'text_concat_clean' list column
# list format will not work with spaCy
# must be in list format for vectorization

# data0['liststring'] = data0['text_concat_clean'].apply(lambda x: x[1:-1])
# data0['liststring'] = data0['liststring'].str.replace(',','')

data1['liststring'] = data1['text_concat_clean'].apply(lambda x: x[1:-1])
data1['liststring'] = data1['liststring'].str.replace(',','')

data2['liststring'] = data2['text_concat_clean'].apply(lambda x: x[1:-1])
data2['liststring'] = data2['liststring'].str.replace(',','')

data3['liststring'] = data3['text_concat_clean'].apply(lambda x: x[1:-1])
data3['liststring'] = data3['liststring'].str.replace(',','')

data4['liststring'] = data4['text_concat_clean'].apply(lambda x: x[1:-1])
data4['liststring'] = data4['liststring'].str.replace(',','')



# data1['liststring'] = [[' '.join(i)] for i in data1['text_concat_clean']]


# data1["liststring"]= data1["text_concat_clean"].str.join("")
# data1['liststring'] = " ".join(data1['text_concat_clean'])

# data0['liststring'] = [' '.join(map(str, l)) for l in data0['text_concat_clean']]

# data1['liststring'] = [' '.join(map(str, l)) for l in data1['text_concat_clean']]
# data2['liststring'] = [' '.join(map(str, l)) for l in data2['text_concat_clean']]
# data3['liststring'] = [' '.join(map(str, l)) for l in data3['text_concat_clean']]
# data4['liststring'] = [' '.join(map(str, l)) for l in data4['text_concat_clean']]

# data0.head(2)

Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653,"['great', 'welcome', 'SenSanders', 'CA44', 'ra...",'great' 'welcome' 'SenSanders' 'CA44' 'rally' ...,"[-0.1952472637594289, 0.39182178540036694, 0.0..."
1,1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609,"['new', 'apartment', 'new', 'business', 'new',...",'new' 'apartment' 'new' 'business' 'new' 'rest...,"[-0.207492859274217, 0.3944523199382951, 0.048..."


In [46]:
# # Create a new column of each spaCy doc's vector

# data0['vectors'] = [nlp(doc).vector for doc in data0['liststring']]

# print(type(data0['vectors'][0]))

# data0.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653,"['great', 'welcome', 'SenSanders', 'CA44', 'ra...",'great' 'welcome' 'SenSanders' 'CA44' 'rally' ...,"[-0.1952472637594289, 0.39182178540036694, 0.0..."
1,1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609,"['new', 'apartment', 'new', 'business', 'new',...",'new' 'apartment' 'new' 'business' 'new' 'rest...,"[-0.207492859274217, 0.3944523199382951, 0.048..."


In [None]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data0.to_json('drive/MyDrive/Python/tweet_vectors_0.json')

In [None]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 1
data1['vectors'] = [nlp(doc).vector for doc in data1['liststring']]

print(type(data1['vectors'][0]))

data1.head(2)

In [None]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data1.to_json('drive/MyDrive/Python/tweet_vectors_1.json')

In [54]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 2
data2['vectors'] = [nlp(doc).vector for doc in data2['liststring']]

print(type(data2['vectors'][0]))

data2.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,158,SenDougJones,AL,Senator,Doug Jones,D,1760,Still waiting to hear back from the bhambarons...,563716,563533,563533,521567,68069,"['wait', 'hear', 'bhambaron', 'spot', 'rotatio...",'wait' 'hear' 'bhambaron' 'spot' 'rotation' 'f...,"[-0.1972977185165926, 0.4062525852084712, 0.04..."
1,159,SenatorCardin,MD,Senator,Benjamin Cardin,D,2818,Thank you to all the partners who made this po...,962541,962448,962448,899387,119618,"['thank', 'partner', 'possible', 'develop', 'g...",'thank' 'partner' 'possible' 'develop' 'good' ...,"[-0.20329654665282496, 0.39419712725714107, 0...."


In [55]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data2.to_json('drive/MyDrive/Python/tweet_vectors_2.json')

In [56]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 3
data3['vectors'] = [nlp(doc).vector for doc in data3['liststring']]

print(type(data3['vectors'][0]))

data3.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,316,SenDanSullivan,AK,Senator,Dan Sullivan,R,1560,Welcome home to JBERs 425 The Spartan Brigade ...,522905,522531,522531,487221,63572,"['welcome', 'home', 'jber', 'Spartan', 'Brigad...",'welcome' 'home' 'jber' 'Spartan' 'Brigade' 'o...,"[-0.1912793333453031, 0.386952533994063, 0.056..."
1,317,RepRickAllen,GA,Representative,Rick Allen,R,1220,With unemployment at an EIGHTEEN year low our ...,362507,362437,362437,336407,41820,"['unemployment', 'EIGHTEEN', 'year', 'low', 'r...",'unemployment' 'EIGHTEEN' 'year' 'low' 'republ...,"[-0.19739281317865115, 0.3998015770155939, 0.0..."


In [57]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data3.to_json('drive/MyDrive/Python/tweet_vectors_3.json')

In [58]:
# Create a new column of each spaCy doc's vector
# Processing sub-dataframe chunk 4
data4['vectors'] = [nlp(doc).vector for doc in data4['liststring']]

print(type(data4['vectors'][0]))

data4.head(2)

<class 'numpy.ndarray'>


Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,474,CongPalazzo,MS,Representative,Steven Palazzo,R,2496,As Mississippi reopens we should still do all ...,787709,786846,786846,725544,90501,"['Mississippi', 'reopen', 'protect', 'health',...",'Mississippi' 'reopen' 'protect' 'health' 'saf...,"[-0.19360094894341023, 0.39765584571440243, 0...."
1,475,RepMarkGreen,TN,Representative,Mark Green,R,2870,Anarchy is not equal to freedom Anarchy is a t...,936960,936485,936485,865887,106606,"['anarchy', 'equal', 'freedom', 'anarchy', 'th...",'anarchy' 'equal' 'freedom' 'anarchy' 'threat'...,"[-0.2089500012087487, 0.39175376067361734, 0.0..."


In [59]:
# saving cleaned file to google drive
# the file MUST be saved as .json due to .csv files dropping vector column formatting
data4.to_json('drive/MyDrive/Python/tweet_vectors_4.json')