# Text Mining with tweets ( Hillary vs Trump ) and PCA

### Exploring the dataset

In [0]:
import pandas as pd 
df = pd.read_csv('tweets.csv')

In [0]:
df.columns

Index(['id', 'handle', 'text', 'is_retweet', 'original_author', 'time',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'is_quote_status', 'lang', 'retweet_count',
       'favorite_count', 'longitude', 'latitude', 'place_id',
       'place_full_name', 'place_name', 'place_type', 'place_country_code',
       'place_country', 'place_contained_within', 'place_attributes',
       'place_bounding_box', 'source_url', 'truncated', 'entities',
       'extended_entities'],
      dtype='object')

In [0]:
df['handle'].unique()

array(['HillaryClinton', 'realDonaldTrump'], dtype=object)

In [0]:
df['text'][4]

"Both candidates were asked about how they'd confront racial injustice. Only one had a real answer. https://t.co/sjnEokckis"

In [0]:
df['text'].head()

0    The question in this election: Who can put the...
1    Last night, Donald Trump said not paying taxes...
2    Couldn't be more proud of @HillaryClinton. Her...
3    If we stand together, there's nothing we can't...
4    Both candidates were asked about how they'd co...
Name: text, dtype: object

In [0]:
df['text'][1]

'Last night, Donald Trump said not paying taxes was "smart." You know what I call it? Unpatriotic. https://t.co/t0xmBfj7zF'

### Getting rid of trailing urls that will perturb our text minding in the dataset

In [0]:
import re

In [0]:
re.sub('http://\S+|https://\S+','',df['text'][1])

'Last night, Donald Trump said not paying taxes was "smart." You know what I call it? Unpatriotic. '

### Transforming all Tweets into a nice dataframe

In [0]:
tweet_text=[]
clinton_trump_musk=[]
for i in range(len(df)):
  tweet_text.append(re.sub('http://\S+|https://\S+','',df['text'][i]))
  if df.loc[i]['handle'] == 'realDonaldTrump':
    clinton_trump.append(0)
  if df.loc[i]['handle'] == 'HillaryClinton':
    clinton_trump.append(1)

In [0]:
nos_tweets = pd.DataFrame({ 'text' : tweet_text, 'c_t' : clinton_trump })

In [0]:
import numpy as np
nos_tweets_ale = nos_tweets.reindex(np.random.permutation(nos_tweets.index)).copy()

In [0]:
nos_tweets_ale

Unnamed: 0,text,c_t
2083,"""Hillary Clinton may be our 1st woman presiden...",1
3491,"If you’re too dangerous to get on a plane, you...",1
2043,"""No major party nominee in the history of this...",1
3766,WATCH NOW: The @realDonaldTrump supporters you...,0
1324,“@TeamUSA is showing the world what this count...,1
...,...,...
3899,Gold Star families like Betsy’s have paid the ...,1
2676,Mike Pence's battle against Planned Parenthood...,1
1698,Join me in Florida tomorrow! \n#MakeAmericaGre...,0
6048,Thank you for your interest &amp; support duri...,0


### creating a trainingset from our tweets

In [0]:
tweets_tr = nos_tweets_ale[:4444].copy()

In [0]:
tweets_tr

Unnamed: 0,text,c_t
2083,"""Hillary Clinton may be our 1st woman presiden...",1
3491,"If you’re too dangerous to get on a plane, you...",1
2043,"""No major party nominee in the history of this...",1
3766,WATCH NOW: The @realDonaldTrump supporters you...,0
1324,“@TeamUSA is showing the world what this count...,1
...,...,...
6427,"""@Kacee50: @realDonaldTrump Women make up the ...",0
3011,Just like he shouldn’t have his finger on the ...,1
4716,"Voters in CT, DE, MD, PA, and RI head to the p...",1
1786,I turned down a meeting with Charles and David...,0


### Creating a testset from our tweets

In [0]:
tweets_te = nos_tweets_ale[4444:].copy()

In [0]:
tweets_te

Unnamed: 0,text,c_t
227,Tomorrow's the day! Knock on doors and make ca...,0
2563,1973: \n \nDonald Trump is sued for housing di...,1
3606,"""I’ve seen her judgment, toughness, and commit...",1
3268,They are who we're fighting for #NotOneMore,1
857,Unions stand up for workers who need someone t...,1
...,...,...
3899,Gold Star families like Betsy’s have paid the ...,1
2676,Mike Pence's battle against Planned Parenthood...,1
1698,Join me in Florida tomorrow! \n#MakeAmericaGre...,0
6048,Thank you for your interest &amp; support duri...,0


### Transforming Texts into vectors with CountVectorizer and TfidfTransfomer
CountVectorizer Vector of the word dictionary of the whole dataset, each sample is then represented as a single vector contaning the counts of word appearances

Tfidf transformer rescales counts according to Tf and Idf formulas

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [0]:
X = tweets_tr['text']
Y = tweets_tr['c_t']
count_vect = CountVectorizer()

In [0]:
count_vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
X_counts = count_vect.transform(X)

In [0]:
X_counts

<4444x7642 sparse matrix of type '<class 'numpy.int64'>'
	with 67807 stored elements in Compressed Sparse Row format>

In [0]:
tf_transformer = TfidfTransformer(use_idf=False)

In [0]:
tf_transformer.fit(X_counts)
X_tf = tf_transformer.transform(X_counts)

In [0]:
X_tf

<4444x7642 sparse matrix of type '<class 'numpy.float64'>'
	with 67807 stored elements in Compressed Sparse Row format>

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [0]:
clf = MultinomialNB()

In [0]:
clf.fit(X_tf,Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### building a pipeline of a classifier with all transformations

In [0]:
from sklearn.pipeline import Pipeline

In [0]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', MultinomialNB())])



In [0]:
text_clf.fit(X,Y)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=False)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

### Testing our pipeline with cross validation

In [0]:
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split

In [0]:
cv = ShuffleSplit(n_splits=50, test_size=0.3)
score = cross_val_score(text_clf,X,Y,cv=cv)

In [0]:
score.mean()

0.8930284857571215

In [0]:
score.std()*2

0.01505011420010093

In [0]:
X_final, X_p, y_final, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [0]:
text_clf.fit(X_final,y_final)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=False)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [0]:
text_clf.predict()

array([0])

### Performin Principal Component Ananlyses ( Dimensionality Reduction ) on a dataset

In [0]:
from sklearn.decomposition import PCA

In [0]:
import pandas as pd
df = pd.read_csv('train_data_cleaned.csv')

In [0]:
pca_3d = PCA(n_components=3)

In [0]:
pca_3d.fit(df)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [0]:
pca_3d.singular_values_

array([7.53458007e+08, 1.64781329e+03, 7.71820065e+02])

In [0]:
X_pca = pca_3d.transform(df)

In [0]:
X_pca

array([[-5.82144965e+07,  2.05866887e+02,  3.46991474e+01],
       [ 3.26783035e+07,  2.03203021e+02,  1.60511729e+01],
       [-2.80608965e+07,  2.02006729e+02,  1.13023112e+01],
       [-1.50144965e+07,  1.98967170e+02, -1.44488082e+00],
       [-5.64000965e+07,  1.99782656e+02,  3.83420718e+01],
       [ 1.10438304e+08,  1.92238886e+02, -1.49338530e+00],
       [-3.23808965e+07,  1.96006576e+02, -1.13522313e+01],
       [-5.49312965e+07,  1.94509234e+02, -5.55001225e+00],
       [ 1.78175035e+07,  1.88539761e+02,  5.68904245e+01],
       [ 7.71743035e+07,  1.86562792e+02,  5.25799984e+01],
       [ 2.48159035e+07,  1.86809439e+02,  5.33569741e+01],
       [-8.70729648e+06,  1.88261542e+02,  8.10416793e+00],
       [ 1.05503518e+05,  1.83794526e+02, -3.27574147e+00],
       [ 1.47071035e+07,  1.81055076e+02, -1.55890821e+01],
       [-6.18432965e+07,  1.83564075e+02, -4.46098212e+01],
       [ 2.22239035e+07,  1.78840314e+02,  3.45549796e+01],
       [-5.31168965e+07,  1.80996114e+02