# data cleaning and vectorization

# install and import

In [168]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the whole session, including
# any deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings("ignore")
# Let pandas render up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [169]:
# pip install nltk

In [170]:
import nltk

In [171]:
# Download the NLTK data packages requested by this notebook, in order.
for resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource)
# Kept as the final expression so the cell still displays the returned status.
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hbpbn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hbpbn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hbpbn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hbpbn\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [172]:
sample_text = "Oh man, this is pretty cool. We will do more such things. 2 ½ % ()"

In [173]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [174]:
sentence_token = sent_tokenize(sample_text.lower())

In [175]:
sentence_token

['oh man, this is pretty cool.', 'we will do more such things.', '2 ½ % ()']

In [176]:
word_token = word_tokenize(sample_text.lower())
word_token

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 '2',
 '½',
 '%',
 '(',
 ')']

In [177]:
# Removing punctuation: keep only the purely alphabetic tokens.
# This also drops non-letter tokens such as '2', '½' and '%'.
tokens_without_punc = list(filter(str.isalpha, word_token))
tokens_without_punc

['oh',
 'man',
 'this',
 'is',
 'pretty',
 'cool',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things']

# removing stopwords

In [178]:
from nltk.corpus import stopwords

In [179]:
stop_words = stopwords.words("english")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [180]:
# Drop every token that appears in the stop-word list.
tokens_without_sw = [
    token
    for token in tokens_without_punc
    if token not in stop_words
]
tokens_without_sw

['oh', 'man', 'pretty', 'cool', 'things']

# data normalization - Lemmatization

In [181]:
from nltk.stem import WordNetLemmatizer

In [182]:
WordNetLemmatizer().lemmatize("drive")

'drive'

In [183]:
WordNetLemmatizer().lemmatize("driver")

'driver'

In [184]:
# Lemmatize each remaining token, using one lemmatizer for the whole list.
lem = list(map(WordNetLemmatizer().lemmatize, tokens_without_sw))
lem

['oh', 'man', 'pretty', 'cool', 'thing']

In [185]:
# data normalization - Stemming

In [186]:
from nltk.stem import PorterStemmer

In [187]:
PorterStemmer().stem("children")

'children'

In [188]:
WordNetLemmatizer().lemmatize("children")

'child'

In [189]:
# Stem each remaining token, using one stemmer for the whole list.
stem = list(map(PorterStemmer().stem, tokens_without_sw))

In [190]:
stem

['oh', 'man', 'pretti', 'cool', 'thing']

In [191]:
" ".join(lem)

'oh man pretty cool thing'

In [192]:
def cleaning(data):
    """Normalize raw text into a space-separated string of lemmas.

    Steps: lowercase and tokenize, keep only alphabetic tokens (which drops
    punctuation, digits and symbols), remove stop words, then lemmatize.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).

    Notes
    -----
    Reads the notebook-level ``stop_words`` list at call time, so later
    changes to that list affect the result.
    """
    text_tokens = word_tokenize(data.lower())

    # Snapshot the stop words as a set: constant-time membership tests
    # instead of scanning the whole list once per token.
    stop_word_set = set(stop_words)
    # One lemmatizer for the whole text rather than a new one per token.
    lemmatizer = WordNetLemmatizer()

    text_cleaned = [
        lemmatizer.lemmatize(token)
        for token in text_tokens
        if token.isalpha() and token not in stop_word_set
    ]

    return " ".join(text_cleaned)
    

In [193]:
pd.Series(sample_text).apply(cleaning)

0    oh man pretty cool thing
dtype: object

# Sentiment Analysis

In [194]:
sample_text= "Oh man, this is pretty cool. We will do more such things. don't aren't are not. no problem"

In [195]:
s = sample_text.replace("'", "")
s

'Oh man, this is pretty cool. We will do more such things. dont arent are not. no problem'

In [196]:
word = word_tokenize(s.lower())
word

['oh',
 'man',
 ',',
 'this',
 'is',
 'pretty',
 'cool',
 '.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things',
 '.',
 'dont',
 'arent',
 'are',
 'not',
 '.',
 'no',
 'problem']

In [197]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [198]:
# Drop stop words from the tokens; punctuation tokens remain because no
# isalpha() filter is applied here.
cleaning_words = [
    token
    for token in word
    if token not in stop_words
]
cleaning_words

['oh',
 'man',
 ',',
 'pretty',
 'cool',
 '.',
 'things',
 '.',
 'dont',
 'arent',
 '.',
 'problem']

In [199]:
# Keep the negations "not" and "no" for sentiment analysis by taking them out
# of the stop-word list. The membership guard makes this cell safe to re-run:
# list.remove raises ValueError once the word has already been removed.
for negation in ("not", "no"):
    if negation in stop_words:
        stop_words.remove(negation)
    
def cleaning_fsa(data):
    """Clean raw text for sentiment analysis and return it as one string.

    Apostrophes are stripped before tokenizing, so a contraction such as
    "don't" becomes the single alphabetic token "dont". The text is then
    lowercased, tokenized, reduced to alphabetic tokens, filtered against
    the stop-word list and lemmatized.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).

    Notes
    -----
    Reads the notebook-level ``stop_words`` list at call time.
    """
    # 1.step --- removing apostrophes
    text = data.replace("'", "")

    # 2.step --- tokenize
    text_token = word_tokenize(text.lower())

    # 3.step --- removing punctuation (and any other non-alphabetic token)
    text_without_punc = [token for token in text_token if token.isalpha()]

    # 4.step --- removing stopwords (set lookup avoids one list scan per token)
    stop_word_set = set(stop_words)
    text_without_sw = [token for token in text_without_punc if token not in stop_word_set]

    # 5.step --- lemmatization (one lemmatizer for the whole text)
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(token) for token in text_without_sw]

    # 6.step --- joining
    return " ".join(text_cleaned)

In [200]:
np.array(pd.Series(sample_text).apply(cleaning_fsa))

array(['oh man pretty cool thing dont arent not no problem'], dtype=object)

# CountVectorization and TF-IDF Vectorization

In [201]:
# Load the tweets from a CSV file given by a relative path, then preview the
# first rows.
df = pd.read_csv("airline_tweets.csv")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [202]:
# Keep only the sentiment label and the tweet text.
# NOTE(review): this rebinds `df`, so the full frame is no longer reachable
# without re-running the loading cell.
df = df[['airline_sentiment','text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [203]:
# Continue with only the first 8 rows — presumably so the vectorized matrices
# below stay small enough to read by eye (TODO confirm intent).
df = df.head(8)
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [204]:
df2 = df.copy()
df2

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [205]:
# Replace each tweet in df2 with its cleaned form produced by cleaning_fsa.
df2["text"] = df2["text"].apply(cleaning_fsa)
df2["text"]

0                          virginamerica dhepburn said
1    virginamerica plus youve added commercial expe...
2    virginamerica didnt today must mean need take ...
3    virginamerica really aggressive blast obnoxiou...
4                   virginamerica really big bad thing
5    virginamerica seriously would pay flight seat ...
6    virginamerica yes nearly every time fly vx ear...
7    virginamerica really missed prime opportunity ...
Name: text, dtype: object

In [206]:
# Count Vectorization

In [207]:
X = df2["text"]
y = df2["airline_sentiment"]

In [208]:
from sklearn.model_selection import train_test_split

In [209]:
# Hold out half of the rows for testing. Stratifying on y keeps the class mix
# similar in both halves, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, stratify = y, random_state = 42)

In [210]:
from sklearn.feature_extraction.text import CountVectorizer

In [211]:
X_train

6    virginamerica yes nearly every time fly vx ear...
0                          virginamerica dhepburn said
2    virginamerica didnt today must mean need take ...
4                   virginamerica really big bad thing
Name: text, dtype: object

In [212]:
# Learn the vocabulary from the training texts only, then encode the test
# texts with that same fitted vectorizer (the test matrix has the same columns).
vectorizer =CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [213]:
vectorizer.get_feature_names_out()

array(['another', 'away', 'bad', 'big', 'dhepburn', 'didnt', 'ear',
       'every', 'fly', 'go', 'mean', 'must', 'nearly', 'need', 'really',
       'said', 'take', 'thing', 'time', 'today', 'trip', 'virginamerica',
       'vx', 'worm', 'yes'], dtype=object)

In [214]:
X_train_count.toarray()

array([[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
        0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0]], dtype=int64)

In [216]:
df_train_count = pd.DataFrame(X_train_count.toarray(), columns=vectorizer.get_feature_names_out(), index=X_train.index)

In [217]:
df_train_count

Unnamed: 0,another,away,bad,big,dhepburn,didnt,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
6,0,1,0,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,1,1,1,0,0,0
4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0


In [218]:
X_train

6    virginamerica yes nearly every time fly vx ear...
0                          virginamerica dhepburn said
2    virginamerica didnt today must mean need take ...
4                   virginamerica really big bad thing
Name: text, dtype: object

In [219]:
# Look up the training tweet whose index label is 6 (label-based, not positional).
X_train.loc[6]

'virginamerica yes nearly every time fly vx ear worm go away'

In [220]:
df_test_count = pd.DataFrame(X_test_count.toarray(), columns = vectorizer.get_feature_names_out(), index = X_test.index)
df_test_count

Unnamed: 0,another,away,bad,big,dhepburn,didnt,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0


In [221]:
X_test

3    virginamerica really aggressive blast obnoxiou...
5    virginamerica seriously would pay flight seat ...
1    virginamerica plus youve added commercial expe...
7    virginamerica really missed prime opportunity ...
Name: text, dtype: object

In [222]:
vectorizer.vocabulary_

{'virginamerica': 21,
 'yes': 24,
 'nearly': 12,
 'every': 7,
 'time': 18,
 'fly': 8,
 'vx': 22,
 'ear': 6,
 'worm': 23,
 'go': 9,
 'away': 1,
 'dhepburn': 4,
 'said': 15,
 'didnt': 5,
 'today': 19,
 'must': 11,
 'mean': 10,
 'need': 13,
 'take': 16,
 'another': 0,
 'trip': 20,
 'really': 14,
 'big': 3,
 'bad': 2,
 'thing': 17}

In [223]:
# TF-IDF

In [224]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [225]:
tf_idf_vectorizer = TfidfVectorizer()

In [226]:
# Fit the TF-IDF vectorizer on the training texts only, then apply the same
# fitted vectorizer to the test texts.
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

In [227]:
tf_idf_vectorizer.get_feature_names_out()

array(['another', 'away', 'bad', 'big', 'dhepburn', 'didnt', 'ear',
       'every', 'fly', 'go', 'mean', 'must', 'nearly', 'need', 'really',
       'said', 'take', 'thing', 'time', 'today', 'trip', 'virginamerica',
       'vx', 'worm', 'yes'], dtype=object)

In [228]:
X_train_tf_idf.toarray()

array([[0.        , 0.31200802, 0.        , 0.        , 0.        ,
        0.        , 0.31200802, 0.31200802, 0.31200802, 0.31200802,
        0.        , 0.        , 0.31200802, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.31200802, 0.        ,
        0.        , 0.16281873, 0.31200802, 0.31200802, 0.31200802],
       [0.        , 0.        , 0.        , 0.        , 0.66338461,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.66338461, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.34618161, 0.        , 0.        , 0.        ],
       [0.34768534, 0.        , 0.        , 0.        , 0.        ,
        0.34768534, 0.        , 0.        , 0.        , 0.        ,
        0.34768534, 0.34768534, 0.        , 0.34768534, 0.        ,
        0.        , 0.34768534, 0.        , 0.        , 0.34768534,
        0.34768534, 0.18143663, 0.        , 0.

In [229]:
df_train_tfidf = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out(), 
                              index= X_train.index)
df_train_tfidf

Unnamed: 0,another,away,bad,big,dhepburn,didnt,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
6,0.0,0.312008,0.0,0.0,0.0,0.0,0.312008,0.312008,0.312008,0.312008,0.0,0.0,0.312008,0.0,0.0,0.0,0.0,0.0,0.312008,0.0,0.0,0.162819,0.312008,0.312008,0.312008
0,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.663385,0.0,0.0,0.0,0.0,0.0,0.346182,0.0,0.0,0.0
2,0.347685,0.0,0.0,0.0,0.0,0.347685,0.0,0.0,0.0,0.0,0.347685,0.347685,0.0,0.347685,0.0,0.0,0.347685,0.0,0.0,0.347685,0.347685,0.181437,0.0,0.0,0.0
4,0.0,0.0,0.483803,0.483803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.252468,0.0,0.0,0.0


In [230]:
df_train_tfidf.loc[0].sort_values(ascending= False )

dhepburn         0.663385
said             0.663385
virginamerica    0.346182
another          0.000000
need             0.000000
worm             0.000000
vx               0.000000
trip             0.000000
today            0.000000
time             0.000000
thing            0.000000
take             0.000000
really           0.000000
nearly           0.000000
away             0.000000
must             0.000000
mean             0.000000
go               0.000000
fly              0.000000
every            0.000000
ear              0.000000
didnt            0.000000
big              0.000000
bad              0.000000
yes              0.000000
Name: 0, dtype: float64

In [231]:
# TF-IDF features of the test tweets as a labelled DataFrame.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2; it is also the accessor used by the
# other cells of this notebook.
df_test_tfidf = pd.DataFrame(
    X_test_tf_idf.toarray(),
    columns=tf_idf_vectorizer.get_feature_names_out(),
    index=X_test.index,
)
df_test_tfidf

Unnamed: 0,another,away,bad,big,dhepburn,didnt,ear,every,fly,go,mean,must,nearly,need,really,said,take,thing,time,today,trip,virginamerica,vx,worm,yes
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.886548,0.0,0.0,0.0,0.0,0.0,0.0,0.462637,0.0,0.0,0.0
5,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483803,0.0,0.0,0.483803,0.0,0.0,0.0,0.252468,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.886548,0.0,0.0,0.0,0.0,0.0,0.0,0.462637,0.0,0.0,0.0
