

### The first step of any text processing task you should be cleaning the data in order to obtain better features. 

# Import Libraries

In [1]:
import pandas as pd
import csv
import numpy as np
import re
import string

In [12]:
# change the seeting of the cells
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

## Data Introduction

    This dataset contains a list of democrats and republican Tweets as of May 2018. 
    The dataset contains 86,460 tweets in total.
    
    Source:https://www.kaggle.com/kapastor/democratvsrepublicantweets/

# Read data file

In [3]:
tweets = pd.read_csv('democratVsRepublicanTweets.csv')

In [4]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L"
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY


## Removing Punctuation


It doesn’t add any extra information while treating text data. 
Therefore removing all instances of it will help us reduce the size of the data.

In [5]:
tweets['No_Punctuation'] = tweets['Tweet'].str.replace('[^\w\s]','')

In [6]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY


## Platform Specific Pre-processing


    For twitter we might need to do a different pre-processing steps. For example, do you want to keep user names not ? how about hashtags? etc


In [7]:
#start processing the tweet
def processPost(tweet):

    #Replace @username with empty string
    tweet = re.sub('@[^\s]+', ' ', tweet)
    
    #Replace RT with empty string
    tweet = re.sub('RT', ' ', tweet)
    
    #Convert www.* or https?://* to " "
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',tweet)
    
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # remove punctuations
    tweet= tweet.translate(str.maketrans('', '', string.punctuation))

    return tweet

In [8]:
tweets["Clean tweet"] = tweets['Tweet'].apply(lambda x: processPost(x))

In [282]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House…
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,Winter Haven resident Alta Vista teacher is one of several recognized by for National Teacher Apprecia…
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18…
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,Meeting with Thanks for taking the time to meet with ED Marucci Guzman NALCABPolicy2018…
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,Hurricane season starts on June 1st Puerto Rico’s readinesswell 🤦🏼‍♂️😡😩


## Lower case


Help with avoiding having multiple copies of the same words. 
For example, while calculating the word count, ‘Analytics’ and ‘analytics’ will be taken as different words.

In [9]:
tweets['Clean tweet'] = tweets['Clean tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [10]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,today senate dems vote to savetheinternet proud to support similar netneutrality legislation here in the house…
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,winter haven resident alta vista teacher is one of several recognized by for national teacher apprecia…
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,noted that hurricane maria has left approximately 90 billion in damages congress has allocated about 18…
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,meeting with thanks for taking the time to meet with ed marucci guzman nalcabpolicy2018…
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,hurricane season starts on june 1st puerto rico’s readinesswell 🤦🏼‍♂️😡😩


## Removal of Stop Words


stop words (or commonly occurring words) should be removed from the text data. 
For this purpose, we can either create a list of stopwords ourselves or we can use predefined libraries.

In [285]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [286]:
tweets['Clean tweet'] = tweets['Clean tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [287]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,today senate dems vote savetheinternet proud support similar netneutrality legislation house…
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,winter resident alta vista teacher one several recognized national teacher apprecia…
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,noted hurricane maria left approximately 90 billion damages congress allocated 18…
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,meeting thanks taking time meet ed marucci guzman nalcabpolicy2018…
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,hurricane season starts june 1st puerto rico’s readinesswell 🤦🏼‍♂️😡😩


## Common word removal


We can  remove commonly occurring words from our text data beside stop words.
We can check the 10 most frequently occurring words in our text data then take call to remove or retain them.

In [288]:
freq = pd.Series(' '.join(tweets['Clean tweet']).split()).value_counts()[:20]

In [289]:
freq

amp          9088
today        7581
great        4492
us           3966
thank        3838
house        3746
tax          3476
bill         3164
day          2978
new          2788
act          2721
work         2610
time         2575
congress     2553
thanks       2413
people       2322
trump        2305
week         2250
president    2209
proud        2192
dtype: int64

In [290]:
## Now, let’s remove these words as their presence is not important 

In [291]:
freq = list(freq.index)
tweets['Clean tweet'] = tweets['Clean tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [292]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,senate dems vote savetheinternet support similar netneutrality legislation house…
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,winter resident alta vista teacher one several recognized national teacher apprecia…
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,noted hurricane maria left approximately 90 billion damages allocated 18…
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,meeting taking meet ed marucci guzman nalcabpolicy2018…
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,hurricane season starts june 1st puerto rico’s readinesswell 🤦🏼‍♂️😡😩


## Rare words removal


Because they’re so rare, the association between them and other words is dominated by noise. 
You can replace rare words with a more general form and then this will have higher counts.

In [293]:
freq = pd.Series(' '.join(tweets['Clean tweet']).split()).value_counts()[-10:]

In [294]:
freq

undertaking…    1
apprehend       1
intellect       1
crysta…         1
9pm11pm         1
soros’s         1
constl          1
“selective      1
mhhh            1
4133            1
dtype: int64

In [295]:
freq = list(freq.index)

In [296]:
tweets['Clean tweet'] = tweets['Tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [297]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L"
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. Congress has allocated about $18…
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY


## Spelling Correction


spelling correction is a useful pre-processing step because this also will help us in reducing multiple copies of words. 
For example, “Analytics” and “analytcs” will be treated as different words even if they are used in the same sense.

In [13]:
from textblob import TextBlob

tweets['Tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

ModuleNotFoundError: No module named 'textblob'

## Tokenization

Tokenization refers to dividing the text into a sequence of words or sentences. 

In [257]:
tweets["Tokens"]= tweets['Clean tweet'].apply(lambda x: x.split())

In [258]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet,Tokens
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L","[Today,, Senate, Dems, vote, to, #SaveTheInternet., Proud, to, support, similar, #NetNeutrality, legislation, here, in, the, House…, https://t.co/n3tggDLU1L]"
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,"[RT, @WinterHavenSun:, Winter, Haven, resident, /, Alta, Vista, teacher, is, one, of, several, recognized, by, @RepDarrenSoto, for, National, Teacher, Apprecia…]"
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. \n\nCongress has allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. Congress has allocated about $18…,"[RT, @NBCLatino:, .@RepDarrenSoto, noted, that, Hurricane, Maria, has, left, approximately, $90, billion, in, damages., Congress, has, allocated, about, $18…]"
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,"[RT, @NALCABPolicy:, Meeting, with, @RepDarrenSoto, ., Thanks, for, taking, the, time, to, meet, with, @LatinoLeader, ED, Marucci, Guzman., #NALCABPolicy2018.…]"
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,"[RT, @Vegalteno:, Hurricane, season, starts, on, June, 1st;, Puerto, Rico’s, readiness...well, 🤦🏼‍♂️😡😩@Pwr4PuertoRico, @RepDarrenSoto, @EspaillatNY]"


In [259]:
TextBlob(tweets['Clean tweet'][1]).words

WordList(['RT', 'WinterHavenSun', 'Winter', 'Haven', 'resident', 'Alta', 'Vista', 'teacher', 'is', 'one', 'of', 'several', 'recognized', 'by', 'RepDarrenSoto', 'for', 'National', 'Teacher', 'Apprecia…'])

## Stemming

Stemming refers to the removal of suffices, like “ing”, “ly”, “s”, etc. 
by a simple rule-based approach. For this purpose, we will use PorterStemmer from the NLTK library.

In [260]:
from nltk.stem import PorterStemmer

st = PorterStemmer()
tweets['Tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    today, senat dem vote to #savetheinternet. proud to support similar #netneutr legisl here in the house… https://t.co/n3tggdlu1l     
1    RT @winterhavensun: winter haven resid / alta vista teacher is one of sever recogn by @repdarrensoto for nation teacher apprecia…   
2    RT @nbclatino: .@repdarrensoto note that hurrican maria ha left approxim $90 billion in damages. congress ha alloc about $18…       
3    RT @nalcabpolicy: meet with @repdarrensoto . thank for take the time to meet with @latinolead ED marucci guzman. #nalcabpolicy2018.…
4    RT @vegalteno: hurrican season start on june 1st; puerto rico’ readiness...wel 🤦🏼‍♂️😡😩@pwr4puertorico @repdarrensoto @espaillatni   
Name: Tweet, dtype: object

## Lemmatization

Lemmatization is a more effective option than stemming because it converts the word into its root word, rather than just stripping the suffices. 
It makes use of the vocabulary and does a morphological analysis to obtain the root word. 

In [261]:
from textblob import Word

tweets['Tweet'] = tweets['Tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [262]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet,Tokens
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L","[Today,, Senate, Dems, vote, to, #SaveTheInternet., Proud, to, support, similar, #NetNeutrality, legislation, here, in, the, House…, https://t.co/n3tggDLU1L]"
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,"[RT, @WinterHavenSun:, Winter, Haven, resident, /, Alta, Vista, teacher, is, one, of, several, recognized, by, @RepDarrenSoto, for, National, Teacher, Apprecia…]"
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria ha left approximately $90 billion in damages. Congress ha allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. Congress has allocated about $18…,"[RT, @NBCLatino:, .@RepDarrenSoto, noted, that, Hurricane, Maria, has, left, approximately, $90, billion, in, damages., Congress, has, allocated, about, $18…]"
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,"[RT, @NALCABPolicy:, Meeting, with, @RepDarrenSoto, ., Thanks, for, taking, the, time, to, meet, with, @LatinoLeader, ED, Marucci, Guzman., #NALCABPolicy2018.…]"
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season start on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,"[RT, @Vegalteno:, Hurricane, season, starts, on, June, 1st;, Puerto, Rico’s, readiness...well, 🤦🏼‍♂️😡😩@Pwr4PuertoRico, @RepDarrenSoto, @EspaillatNY]"


## Part of Speech (POS) Tagging

In [263]:
import nltk

In [264]:
tweets['POS']=tweets['Tokens'].apply(lambda x: nltk.pos_tag(x))

In [265]:
tweets.head()

Unnamed: 0,Party,Handle,Tweet,No_Punctuation,Clean tweet,Tokens,POS
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L",Today Senate Dems vote to SaveTheInternet Proud to support similar NetNeutrality legislation here in the House httpstcon3tggDLU1L,"Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L","[Today,, Senate, Dems, vote, to, #SaveTheInternet., Proud, to, support, similar, #NetNeutrality, legislation, here, in, the, House…, https://t.co/n3tggDLU1L]","[(Today,, NNP), (Senate, NNP), (Dems, NNP), (vote, NN), (to, TO), (#SaveTheInternet., VB), (Proud, NNP), (to, TO), (support, VB), (similar, JJ), (#NetNeutrality, NN), (legislation, NN), (here, RB), (in, IN), (the, DT), (House…, NNP), (https://t.co/n3tggDLU1L, NN)]"
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,RT WinterHavenSun Winter Haven resident Alta Vista teacher is one of several recognized by RepDarrenSoto for National Teacher Apprecia,RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…,"[RT, @WinterHavenSun:, Winter, Haven, resident, /, Alta, Vista, teacher, is, one, of, several, recognized, by, @RepDarrenSoto, for, National, Teacher, Apprecia…]","[(RT, NNP), (@WinterHavenSun:, NNP), (Winter, NNP), (Haven, NNP), (resident, VBD), (/, NNP), (Alta, NNP), (Vista, NNP), (teacher, NN), (is, VBZ), (one, CD), (of, IN), (several, JJ), (recognized, VBN), (by, IN), (@RepDarrenSoto, NN), (for, IN), (National, NNP), (Teacher, NNP), (Apprecia…, NNP)]"
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria ha left approximately $90 billion in damages. Congress ha allocated about $18…,RT NBCLatino RepDarrenSoto noted that Hurricane Maria has left approximately 90 billion in damages \n\nCongress has allocated about 18,RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages. Congress has allocated about $18…,"[RT, @NBCLatino:, .@RepDarrenSoto, noted, that, Hurricane, Maria, has, left, approximately, $90, billion, in, damages., Congress, has, allocated, about, $18…]","[(RT, NNP), (@NBCLatino:, NNP), (.@RepDarrenSoto, NNP), (noted, VBD), (that, IN), (Hurricane, NNP), (Maria, NNP), (has, VBZ), (left, VBN), (approximately, RB), ($90, JJ), (billion, CD), (in, IN), (damages., NN), (Congress, NNP), (has, VBZ), (allocated, VBN), (about, IN), ($18…, NN)]"
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,RT NALCABPolicy Meeting with RepDarrenSoto Thanks for taking the time to meet with LatinoLeader ED Marucci Guzman NALCABPolicy2018,RT @NALCABPolicy: Meeting with @RepDarrenSoto . Thanks for taking the time to meet with @LatinoLeader ED Marucci Guzman. #NALCABPolicy2018.…,"[RT, @NALCABPolicy:, Meeting, with, @RepDarrenSoto, ., Thanks, for, taking, the, time, to, meet, with, @LatinoLeader, ED, Marucci, Guzman., #NALCABPolicy2018.…]","[(RT, NNP), (@NALCABPolicy:, NNP), (Meeting, NNP), (with, IN), (@RepDarrenSoto, NNP), (., .), (Thanks, NNS), (for, IN), (taking, VBG), (the, DT), (time, NN), (to, TO), (meet, VB), (with, IN), (@LatinoLeader, NNP), (ED, NNP), (Marucci, NNP), (Guzman., NNP), (#NALCABPolicy2018.…, NN)]"
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season start on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,RT Vegalteno Hurricane season starts on June 1st Puerto Ricos readinesswell Pwr4PuertoRico RepDarrenSoto EspaillatNY,RT @Vegalteno: Hurricane season starts on June 1st; Puerto Rico’s readiness...well 🤦🏼‍♂️😡😩@Pwr4PuertoRico @RepDarrenSoto @EspaillatNY,"[RT, @Vegalteno:, Hurricane, season, starts, on, June, 1st;, Puerto, Rico’s, readiness...well, 🤦🏼‍♂️😡😩@Pwr4PuertoRico, @RepDarrenSoto, @EspaillatNY]","[(RT, NNP), (@Vegalteno:, NNP), (Hurricane, NNP), (season, NN), (starts, VBZ), (on, IN), (June, NNP), (1st;, CD), (Puerto, NNP), (Rico’s, NNP), (readiness...well, VBP), (🤦🏼‍♂️😡😩@Pwr4PuertoRico, NNP), (@RepDarrenSoto, NNP), (@EspaillatNY, NN)]"
