In [None]:
# Checking to see if GPU drive is active
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sun Apr 25 16:04:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Checking to see if GPU drive is active
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [1]:
# If running in google colab:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
# Download en_core_web_lg for google colab
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.1MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180944 sha256=14ac135795fb0c964288590b53c93dfc6a065dc7a8f5aff67c9a2d2ecf138b41
  Stored in directory: /tmp/pip-ephem-wheel-cache-zugwoddg/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# import regex as re
# import unicodedata

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, pairwise_distances

import spacy
import en_core_web_lg
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Loading the large (sm) spaCy English model, pre-trained on web text data
# In order to use vectorization, you must use the large pipeline package
# https://spacy.io/usage/spacy-101
'''
"To make them compact and fast, spaCy’s small pipeline packages (all packages that end in sm) 
don’t ship with word vectors, and only include context-sensitive tensors... 
So in order to use real word vectors, you need to download a larger pipeline package."
'''
nlp = en_core_web_lg.load()

# max length must be increased due to large size of each document
nlp.max_length = 10000000 # or higher

In [6]:
# # Importing the finalized concatinated dataframe
# # Use this line if running locally
# data = pd.read_csv('../data/tweet_vectors.csv')

# If running in google colab, use this import line:
# Change to file location if necessary
# data = pd.read_csv('/content/drive/MyDrive/Python/tweet_vectors_1.csv')

data1 = pd.read_json('/content/drive/MyDrive/Python/tweet_vectors_1.json')
data2 = pd.read_json('/content/drive/MyDrive/Python/tweet_vectors_2.json')
data3 = pd.read_json('/content/drive/MyDrive/Python/tweet_vectors_3.json')
data4 = pd.read_json('/content/drive/MyDrive/Python/tweet_vectors_4.json')


In [8]:
data1.head()

Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653,"['great', 'welcome', 'SenSanders', 'CA44', 'ra...",'great' 'welcome' 'SenSanders' 'CA44' 'rally' ...,"[-0.19524726380000001, 0.3918217854, 0.0493305..."
1,1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609,"['new', 'apartment', 'new', 'business', 'new',...",'new' 'apartment' 'new' 'business' 'new' 'rest...,"[-0.2074928593, 0.3944523199, 0.0484009482, -0..."
2,2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609,223536,"['literally', 'easy', 'fix', 'save', 'life', '...",'literally' 'easy' 'fix' 'save' 'life' 'httpab...,"[-0.2089821815, 0.39251183640000004, 0.0458777..."
3,3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050,7085835,7085835,6576425,839968,"['Baltimore', 'student', 'kill', 'gun', 'schoo...",'Baltimore' 'student' 'kill' 'gun' 'school' 'y...,"[-0.21396224300000002, 0.3897273741, 0.0522410..."
4,4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,NetNeutralityReminder Obama used 1934 FCC rule...,958737,958292,958292,886492,112053,"['netneutralityreminder', 'Obama', 'FCC', 'rul...",'netneutralityreminder' 'Obama' 'FCC' 'rule' '...,"[-0.20934589990000002, 0.39374101650000004, 0...."


In [10]:
data = pd.concat([data1, data2, data3, data4], axis=0, sort=False)

# data = data.drop_duplicates()

data.head(3)

Unnamed: 0.1,Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length,text_length_1,text_length_2,text_length_3,word_length,text_concat_clean,liststring,vectors
0,0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome SenSanders to CA44 for a r...,1571519,1570974,1570974,1449495,180653,"['great', 'welcome', 'SenSanders', 'CA44', 'ra...",'great' 'welcome' 'SenSanders' 'CA44' 'rally' ...,"[-0.19524726380000001, 0.3918217854, 0.0493305..."
1,1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,New apartments new businesses and new restaura...,2801192,2800624,2800624,2598814,352609,"['new', 'apartment', 'new', 'business', 'new',...",'new' 'apartment' 'new' 'business' 'new' 'rest...,"[-0.2074928593, 0.3944523199, 0.0484009482, -0..."
2,2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,Literally the easiest fix we could make to sav...,1843540,1842810,1842810,1704609,223536,"['literally', 'easy', 'fix', 'save', 'life', '...",'literally' 'easy' 'fix' 'save' 'life' 'httpab...,"[-0.2089821815, 0.39251183640000004, 0.0458777..."


In [11]:
# keeping only certain columns that will be used
columns_keep =['screen_name','state','position','name','party','tweet_count','vectors']

data = data[columns_keep]

In [12]:
data.head(3)

Unnamed: 0,screen_name,state,position,name,party,tweet_count,vectors
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,"[-0.19524726380000001, 0.3918217854, 0.0493305..."
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"[-0.2074928593, 0.3944523199, 0.0484009482, -0..."
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"[-0.2089821815, 0.39251183640000004, 0.0458777..."


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 0 to 151
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   screen_name  623 non-null    object
 1   state        623 non-null    object
 2   position     623 non-null    object
 3   name         623 non-null    object
 4   party        623 non-null    object
 5   tweet_count  623 non-null    int64 
 6   vectors      623 non-null    object
dtypes: int64(1), object(6)
memory usage: 38.9+ KB


In [16]:
data['vectors'][0][0]

0    [-0.19524726380000001, 0.3918217854, 0.0493305...
0    [-0.1972977185, 0.4062525852, 0.04576505, -0.0...
0    [-0.19127933330000002, 0.386952534, 0.05601444...
0    [-0.19360094890000001, 0.3976558457, 0.0497050...
Name: vectors, dtype: object

In [18]:
len(data['vectors'][5])


4

In [19]:
data.shape

(623, 7)

In [20]:
# Define X and train, test, split
X = data['vectors']
X = np.matrix(X.tolist())

# X = pd.DataFrame(data = list(data['vectors']), columns = list(range(1, 301)))


# data_list = [nlp(doc).vector.reshape(1,-1) for doc in data]
# data_ = np.concatenate(data_list)

# data_.shape

# data_list = [nlp(doc).vector.reshape(1,-1) for doc in corpus]
# data = np.concatenate(data_list)

# X = data.reshape(1,-1)
# X.shape

# sent = sent.reshape(1,-1)
# sent.shape

# (300,)

In [21]:
X.shape

(623, 300)

In [31]:

# Instantiate and fit a gridsearch for a KMeans clustering model
km = KMeans()

params = {
    'n_clusters': [2, 3, 4, 5, 6, 7, 8, 9] #, 10] #, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    # 'n_clusters': [25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
   
}

gs = GridSearchCV(km, params, cv=5)
gs.fit(X)

# Get the best estimator
print(gs.best_estimator_)
print('-----------------')
print(silhouette_score(X, gs.best_estimator_.labels_))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
-----------------
0.08684503348682494


In [32]:
# Create new column with KMeans labels and look at distribution of labels
data['km_labels'] = gs.best_estimator_.labels_
data['km_labels'].value_counts()

1    133
8    112
4     96
6     95
3     90
5     63
7     18
0     12
2      4
Name: km_labels, dtype: int64