# Data Cleaning and Vectorization For NLP

## Install and Import

In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 50)

## Tokenization

In [4]:
import nltk

In [5]:
sample_text= "Oh man, this is pretty cool. We will do more such things. 2"

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [7]:
sentence_token = sent_tokenize(sample_text.lower())
sentence_token

['oh man, this is pretty cool.', 'we will do more such things.', '2']

In [8]:
word_token = word_tokenize(sample_text.lower())
word_token

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 '2']

## Removing Punctuation and Numbers

In [9]:
# Keep only purely alphabetic tokens, which drops punctuation and numbers.
# (Filter with str.isalnum instead to keep numeric tokens as well.)
tokens_without_punc = list(filter(str.isalpha, word_token))
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things']

## Removing Stopwords

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\husey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords

In [12]:
stop_words = stopwords.words("english")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things']

In [14]:
# Drop every token that appears in the stop-word list.
# Caution: for sentiment analysis, negative auxiliary verbs must not be
# removed like this.
token_without_sw = [
    token
    for token in tokens_without_punc
    if token not in stop_words
]
token_without_sw

['oh', 'man', 'pretty', 'cool', 'things']

## Data Normalization-Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
# WordNetLemmatizer needs the WordNet corpus. Fetch it here (as is already
# done for the stop words) so the cells below also run on a machine where
# the corpus has not been downloaded yet.
nltk.download('wordnet')

In [17]:
# pos="n" asks the lemmatizer to treat the word as a noun, which is why the
# verb form "drove" comes back unchanged in the output below.
# NOTE(review): pos="v" should reduce it to the base verb - worth showing both.
WordNetLemmatizer().lemmatize("drove", pos = "n")

'drove'

In [18]:
# Lemmatize each remaining token, sharing one lemmatizer across the list.
lem = list(map(WordNetLemmatizer().lemmatize, token_without_sw))

In [19]:
lem

['oh', 'man', 'pretty', 'cool', 'thing']

## Data Normalization-Stemming

In [20]:
from nltk.stem import PorterStemmer

In [21]:
PorterStemmer().stem("driving")

'drive'

In [22]:
# Stem each remaining token, sharing one stemmer across the list.
stem = list(map(PorterStemmer().stem, token_without_sw))

In [23]:
stem

['oh', 'man', 'pretti', 'cool', 'thing']

## Joining

In [24]:
" ".join(lem)

'oh man pretty cool thing'

In [25]:
## Cleaning Function - for classification (NOT for sentiment analysis)
def cleaning(data):
    """Clean raw text for classification (not for sentiment analysis).

    The text is lower-cased and tokenized, tokens that are not purely
    alphabetic (punctuation, numbers) are dropped, stop words are removed
    and the remaining tokens are lemmatized.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()

    # 1. Tokenize
    text_tokens = word_tokenize(data.lower())

    # 2. Remove punctuation and numbers (anything not purely alphabetic)
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 3. Remove stop words (uses the notebook-level `stop_words` list as it
    #    is at call time)
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    # 4. Lemmatize
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 5. Join back into a single string
    return " ".join(text_cleaned)

In [26]:
pd.Series(sample_text).apply(cleaning)

0    oh man pretty cool thing
dtype: object

## Cleaning Function - for sentiment analysis

In [27]:
sample_text= "Oh man, this is pretty cool. We will do more such things. don't aren't are not. no problem"

In [28]:
# Strip apostrophes before tokenizing so that contractions stay in one piece
# ("don't" -> "dont", "aren't" -> "arent").
s = "".join(sample_text.split("'"))
word = word_tokenize(s)
word

['Oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'We',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 'dont',
 'arent',
 'are',
 'not',
 '.',
 'no',
 'problem']

In [29]:
# Keep the negation words for sentiment analysis by taking them out of the
# stop-word list. Each removal is guarded so that re-running this cell does
# not raise ValueError once the words are already gone.
# NOTE: this mutates the shared `stop_words` list, so `cleaning` defined
# earlier will also keep "not" and "no" from this point on.
for negation in ("not", "no"):
    if negation in stop_words:
        stop_words.remove(negation)

def cleaning_fsa(data):
    """Clean raw text for sentiment analysis.

    Same pipeline as ``cleaning``, except that apostrophes are stripped
    before tokenizing so that negative auxiliary verbs stay in the text
    (e.g. "don't" becomes the single token "dont").

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()

    # 1. Remove apostrophes to keep negative auxiliary verbs in the text
    text = data.replace("'", '')

    # 2. Tokenize
    text_tokens = word_tokenize(text.lower())

    # 3. Remove punctuation and numbers (anything not purely alphabetic)
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    # 4. Remove stop words (uses the notebook-level `stop_words` list as it
    #    is at call time)
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    # 5. Lemmatize
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 6. Join back into a single string
    return " ".join(text_cleaned)

In [30]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [31]:
pd.Series(sample_text).apply(cleaning_fsa)

0    oh man pretty cool thing dont arent not no pro...
dtype: object

## CountVectorization and TF-IDF Vectorization

In [32]:
# Load the tweets from a CSV file located next to the notebook.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the file - confirm this is intended rather than the
# file's actual encoding (e.g. 'utf-8').
df = pd.read_csv("airline_tweets.csv", encoding = 'unicode_escape')

In [33]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [34]:
# Only the sentiment label and the tweet text are used from here on.
df = df.loc[:, ['airline_sentiment', 'text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [35]:
# Keep only the first 8 tweets as a tiny demo corpus.
df = df.head(8)
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [36]:
df.head(8)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [37]:
df2 = df.copy()

In [38]:
# Clean from the untouched `df` column (df2 is a copy of df, so the index
# matches). Re-running this cell then always gives the same result instead
# of cleaning already-cleaned text a second time.
df2["text"] = df["text"].apply(cleaning_fsa)
df2

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus youve added commercial expe...
2,neutral,virginamerica didnt today must mean need take ...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing
5,negative,virginamerica seriously would pay flight seat ...
6,positive,virginamerica yes nearly every time fly vx go ...
7,neutral,virginamerica really missed prime opportunity ...


## CountVectorization

In [39]:
X = df2["text"]
y = df2["airline_sentiment"]

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
# Split the tweets in half. `stratify=y` splits in a stratified fashion using
# the sentiment labels, and the fixed `random_state` makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
# Fit the vectorizer (learn the vocabulary) on the training texts only, then
# reuse that same fitted vectorizer to transform the test texts.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [46]:
vectorizer.get_feature_names_out()

array(['another', 'away', 'bad', 'big', 'dhepburn', 'didnt', 'every',
       'fly', 'go', 'mean', 'must', 'nearly', 'need', 'really', 'said',
       'take', 'thing', 'time', 'today', 'trip', 'virginamerica', 'vx',
       'yes'], dtype=object)

In [47]:
X_train_count.toarray()

array([[0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
        1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
        0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
        0]], dtype=int64)

In [49]:
df_count = pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names_out())
df_count

Unnamed: 0,another,away,bad,big,dhepburn,didnt,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,yes
0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,1
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
2,1,0,0,0,0,1,0,0,0,1,1,0,1,0,0,1,0,0,1,1,1,0,0
3,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0


In [50]:
X_train

6    virginamerica yes nearly every time fly vx go ...
0                          virginamerica dhepburn said
2    virginamerica didnt today must mean need take ...
4                   virginamerica really big bad thing
Name: text, dtype: object

In [51]:
# 6 is an index label kept from the original frame, not a position.
X_train.loc[6]

'virginamerica yes nearly every time fly vx go away'

In [52]:
vectorizer.vocabulary_

{'virginamerica': 20,
 'yes': 22,
 'nearly': 11,
 'every': 6,
 'time': 17,
 'fly': 7,
 'vx': 21,
 'go': 8,
 'away': 1,
 'dhepburn': 4,
 'said': 14,
 'didnt': 5,
 'today': 18,
 'must': 10,
 'mean': 9,
 'need': 12,
 'take': 15,
 'another': 0,
 'trip': 19,
 'really': 13,
 'big': 3,
 'bad': 2,
 'thing': 16}

## TF-IDF

How sklearn's TF-IDF differs from the standard TF-IDF:
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse it for the
# test texts. Test words that are not among the fitted feature names (e.g.
# "aggressive") get no column in the test matrix shown further below.
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

In [56]:
tf_idf_vectorizer.get_feature_names_out()

array(['another', 'away', 'bad', 'big', 'dhepburn', 'didnt', 'every',
       'fly', 'go', 'mean', 'must', 'nearly', 'need', 'really', 'said',
       'take', 'thing', 'time', 'today', 'trip', 'virginamerica', 'vx',
       'yes'], dtype=object)

In [57]:
X_train_tf_idf.toarray()

array([[0.        , 0.34768534, 0.        , 0.        , 0.        ,
        0.        , 0.34768534, 0.34768534, 0.34768534, 0.        ,
        0.        , 0.34768534, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.34768534, 0.        , 0.        ,
        0.18143663, 0.34768534, 0.34768534],
       [0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.34618161, 0.        , 0.        ],
       [0.34768534, 0.        , 0.        , 0.        , 0.        ,
        0.34768534, 0.        , 0.        , 0.        , 0.34768534,
        0.34768534, 0.        , 0.34768534, 0.        , 0.        ,
        0.34768534, 0.        , 0.        , 0.34768534, 0.34768534,
        0.18143663, 0.        , 0.        ],
       [0.        , 0.        , 0.48380259, 0.483

In [59]:
df_tfidf = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out())
df_tfidf

Unnamed: 0,another,away,bad,big,dhepburn,didnt,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,yes
0,0.0,0.347685,0.0,0.0,0.0,0.0,0.347685,0.347685,0.347685,0.0,0.0,0.347685,0.0,0.0,0.0,0.0,0.0,0.347685,0.0,0.0,0.181437,0.347685,0.347685
1,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.346182,0.0,0.0
2,0.347685,0.0,0.0,0.0,0.0,0.347685,0.0,0.0,0.0,0.347685,0.347685,0.0,0.347685,0.0,0.0,0.347685,0.0,0.0,0.347685,0.347685,0.181437,0.0,0.0
3,0.0,0.0,0.483803,0.483803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.252468,0.0,0.0


In [60]:
# 6 is an index label kept from the original frame, not a position.
X_train.loc[6]

'virginamerica yes nearly every time fly vx go away'

In [61]:
df_tfidf.loc[3].sort_values(ascending=False)

really           0.483803
bad              0.483803
big              0.483803
thing            0.483803
virginamerica    0.252468
another          0.000000
vx               0.000000
trip             0.000000
today            0.000000
time             0.000000
take             0.000000
said             0.000000
nearly           0.000000
need             0.000000
away             0.000000
must             0.000000
mean             0.000000
go               0.000000
fly              0.000000
every            0.000000
didnt            0.000000
dhepburn         0.000000
yes              0.000000
Name: 3, dtype: float64

In [65]:
pd.DataFrame(X_test_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out())

Unnamed: 0,another,away,bad,big,dhepburn,didnt,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.886548,0.0,0.0,0.0,0.0,0.0,0.0,0.462637,0.0,0.0
1,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.252468,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.886548,0.0,0.0,0.0,0.0,0.0,0.0,0.462637,0.0,0.0


In [66]:
# 3 is an index label kept from the original frame, not a position.
X_test.loc[3]

'virginamerica really aggressive blast obnoxious entertainment guest face amp little recourse'