In [296]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [297]:
## Load the CNN health tweets from our repo. The file is read as pipe-delimited text with no header row
## (so columns are labelled 0, 1, 2, ...), and lines that cannot be parsed are skipped.
data = pd.read_csv(
    r"../inputdata/cnnhealth.txt",
    header=None,
    sep="|",
    on_bad_lines='skip',
)

In [298]:
## Show the frame exactly as loaded: a label line followed by the DataFrame's text representation
print("Data in its unedited state: ", data, sep="\n")

Data in its unedited state: 
                       0                               1  \
0     576880531301801984  Sat Mar 14 23:00:11 +0000 2015   
1     576820122666471424  Sat Mar 14 19:00:08 +0000 2015   
2     576744652717461504  Sat Mar 14 14:00:15 +0000 2015   
3     576736754436304896  Sat Mar 14 13:28:52 +0000 2015   
4     576736614766010368  Sat Mar 14 13:28:18 +0000 2015   
...                  ...                             ...   
4040  239699936671854593  Sun Aug 26 12:24:52 +0000 2012   
4041  239504620710420480  Sat Aug 25 23:28:46 +0000 2012   
4042  239410205757145088  Sat Aug 25 17:13:35 +0000 2012   
4043  239386320416428032  Sat Aug 25 15:38:41 +0000 2012   
4044  239366825018806272  Sat Aug 25 14:21:12 +0000 2012   

                                                      2  
0     An abundance of online info can turn us into e...  
1     A plant-based diet that incorporates fish may ...  
2     It doesn't take much to damage your hearing at...  
3     RT @CNN: For

In [299]:
## Strip URLs from the tweet text (column 2).
## Nearly all tweets have embedded links for cnn.com, so keeping those will not help us better classify/differentiate the
## tweets; removing the URL also drops non-meaningful tokens such as server locations in the URL path. Another potential
## method could be an appearance threshold for certain words, i.e. if B8VAgxHCYAETD6L.jpg does not appear more than once,
## then remove it from our pool.
## Caveat: some links are for pbs, instagram, etc., so removing every URL may lose information that could prove meaningful
## as a differentiator for clustering.
## The pattern is a raw string so Python does not interpret any backslash escapes before the regex engine sees them. It
## matches "http://" or "https://" followed by everything up to the next space or newline.
data[2] = data[2].replace(r'https?://[^\n ]*', '', regex=True)

In [300]:
## Pull the tweet-text column (label 2) out of the DataFrame as a NumPy array so we can perform vector operations on it
tweets = data.loc[:, 2].to_numpy()

In [301]:
## Quick look at the tweet array; NumPy abbreviates the middle of a long array with '...'
print(tweets)

['An abundance of online info can turn us into e-hypochondriacs. Or, worse, lead us to neglect getting the care we need '
 'A plant-based diet that incorporates fish may be the key to preventing colorectal cancers:  '
 "It doesn't take much to damage your hearing at a sports bar or nightclub. That's why a billion people are at risk. "
 ... 'Ann Romney talks about her experience with MS '
 "Make sure your first marathon isn't your last! "
 "Robin Roberts' cancer diagnosis "]


In [309]:
## Create the count vectorizer. stop_words='english' selects scikit-learn's built-in English stop-word list, which drops
## common words such as 'and' and 'it'.
## NOTE: punctuation is not part of the stop-word list; per the scikit-learn docs it is discarded by the vectorizer's
## default token pattern, which only keeps alphanumeric tokens of two or more characters.
vectorizer = CountVectorizer(stop_words='english')

In [311]:
## Learn the vocabulary from the tweets. The call's return value is echoed as the cell output (the vectorizer's repr).
vectorizer.fit(tweets)

CountVectorizer(stop_words='english')

In [312]:
## Get the array of unique vocabulary terms learned across all documents; these become the column labels of the
## feature matrix further down
names = vectorizer.get_feature_names_out()

In [305]:
## Get the feature vectors: transform the tweets into a sparse count matrix with one row per tweet and one column per
## vocabulary term. The bare expression on the last line displays the matrix summary as the cell output.
features = vectorizer.transform(tweets)
features

<4045x7554 sparse matrix of type '<class 'numpy.int64'>'
	with 31043 stored elements in Compressed Sparse Row format>

In [306]:
## Convert the sparse count matrix into a dense NumPy array.
## NOTE(review): the dense form stores every zero explicitly, so it is much larger than the sparse matrix — confirm the
## memory cost is acceptable before running this on a bigger corpus.
featurearray = features.toarray()

In [307]:
## Quick look at the dense matrix; NumPy prints only the corners of a large array
print(featurearray)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [308]:
## Wrap the dense count matrix in a DataFrame whose column labels are the vocabulary terms
df = pd.DataFrame(data=featurearray, columns=names)

In [293]:
## Write the feature matrix to disk as CSV. index=False omits the DataFrame's row index, so the file contains only the
## header row of vocabulary terms and the count columns.
df.to_csv(r'../outputdata/featurematrix.csv', index=False)