# Handling Text Data

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('AllTweets.csv')

In [3]:
df.head()

Unnamed: 0,date,id,link,retweet,text,author
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA


In [4]:
len(df)

88625

In [5]:
df['author'].unique()

array(['NASA', 'AdamSavage', 'various', 'BarackObama', 'DonaldTrump',
       'FiveThirtyEight', 'HillaryClinton', 'KimKardashian',
       'deGrasseTyson', 'ScottKelly', 'RichardDawkins'], dtype=object)

In [6]:
df.loc[0,'text']

'New software on the @Space_Station will make data communications faster and easier for hundreds of scientists: http://go.nasa.gov/2dQrLto\xa0pic.twitter.com/weIfEomT9x'

In [8]:
t = 'New software on the @Space_Station will make data communications faster and easier for hundreds of scientists:'+\
        'http://go.nasa.gov/2dQrLto\xa0pic.twitter.com/weIfEomT9x'

#### To Lowercase:

In [9]:
t = t.lower()
t

'new software on the @space_station will make data communications faster and easier for hundreds of scientists:http://go.nasa.gov/2dqrlto\xa0pic.twitter.com/weifeomt9x'

In [10]:
import nltk

In [12]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\H\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\H\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Split text

In [13]:
tokenized_text = nltk.tokenize.word_tokenize(t)
tokenized_text

['new',
 'software',
 'on',
 'the',
 '@',
 'space_station',
 'will',
 'make',
 'data',
 'communications',
 'faster',
 'and',
 'easier',
 'for',
 'hundreds',
 'of',
 'scientists',
 ':',
 'http',
 ':',
 '//go.nasa.gov/2dqrlto',
 'pic.twitter.com/weifeomt9x']

#### Keep only alphabetic words

In [14]:
alphabetic_text = [w for w in tokenized_text if w.isalpha()]
alphabetic_text

['new',
 'software',
 'on',
 'the',
 'will',
 'make',
 'data',
 'communications',
 'faster',
 'and',
 'easier',
 'for',
 'hundreds',
 'of',
 'scientists',
 'http']

#### Remove stop words:

In [15]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
t = [w for w in alphabetic_text if w not in stopwords]
t

['new',
 'software',
 'make',
 'data',
 'communications',
 'faster',
 'easier',
 'hundreds',
 'scientists',
 'http']

#### Stem words

In [17]:
stemmer = nltk.stem.porter.PorterStemmer()
stemmed = [stemmer.stem(w) for w in t]
stemmed

['new',
 'softwar',
 'make',
 'data',
 'commun',
 'faster',
 'easier',
 'hundr',
 'scientist',
 'http']

In [18]:
new_text = ' '.join(stemmed)
new_text

'new softwar make data commun faster easier hundr scientist http'

#### Merge everything in a function

In [19]:
def clean_text(t):    
    tokenized_text = nltk.tokenize.word_tokenize(t.lower())
    alphabetic_text = [w for w in tokenized_text if w.isalpha()]
    alphabetic_text
    t = [w for w in alphabetic_text if w not in stopwords]
    stemmed = [stemmer.stem(w) for w in t]
    new_text = ' '.join(stemmed)
    return new_text

In [20]:
clean_text(df.loc[1,'text'])

'readi launch orbitalatk antar rocket cargo ship set lift et sunday http'

#### Add new column for cleaned text

In [21]:
df['cleaned_text'] = df['text'].map(clean_text)
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...


## Prepare training Data

### Vectorize text

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=100)

In [23]:
tfidf.fit(df['cleaned_text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=100,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [24]:
new_text

'new softwar make data commun faster easier hundr scientist http'

In [25]:
x = tfidf.transform([new_text]).todense().getA()[0]
x
np.argwhere(x!=0)

array([[15],
       [31],
       [47],
       [56]], dtype=int64)

In [26]:
list(zip(np.array(tfidf.get_feature_names())[x!=0],x[x!=0]))

[('data', 0.5709914684287255),
 ('http', 0.21803050543601585),
 ('make', 0.5777520658974478),
 ('new', 0.5409565528137676)]

In [27]:
x = tfidf.transform(df['cleaned_text'])

In [28]:
x

<88625x100 sparse matrix of type '<class 'numpy.float64'>'
	with 234634 stored elements in Compressed Sparse Row format>

### Additional Features, if you want!

#### Number of words in each text

In [29]:
df['words_count'] = df['text'].map(lambda t: len(t.split(' ')))

In [30]:
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text,words_count
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...,17
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...,20
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...,20
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...,18
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...,17


#### number of punctuation symbols in original text

In [31]:
df['punct_count'] = df['text'].map(lambda t: len([x for x in t if x in (';', ':', ',', '.', '!', '?')]))

In [32]:
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text,words_count,punct_count
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...,17,6
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...,20,8
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...,20,9
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...,18,7
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...,17,9


In [33]:
df['stopwords_count'] = df['text'].map(lambda t: len([x for x in t if x in stopwords]))

In [34]:
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text,words_count,punct_count,stopwords_count
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...,17,6,67
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...,20,8,51
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...,20,9,47
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...,18,7,44
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...,17,9,54


#### Merge new features with original ones

In [35]:
x

<88625x100 sparse matrix of type '<class 'numpy.float64'>'
	with 234634 stored elements in Compressed Sparse Row format>

In [36]:
new_features = df[['words_count','punct_count','stopwords_count']].to_numpy()
new_features

array([[17,  6, 67],
       [20,  8, 51],
       [20,  9, 47],
       ...,
       [11,  3, 25],
       [19,  1, 39],
       [18,  5, 42]], dtype=int64)

In [37]:
new_x = np.concatenate((x.toarray(),new_features), axis=1)

In [158]:
new_x

array([[ 0.        ,  0.        ,  0.        , ..., 17.        ,
         6.        , 67.        ],
       [ 0.        ,  0.        ,  0.        , ..., 20.        ,
         8.        , 51.        ],
       [ 0.        ,  0.        ,  0.        , ..., 20.        ,
         9.        , 47.        ],
       ...,
       [ 0.        ,  0.72377904,  0.        , ..., 11.        ,
         3.        , 25.        ],
       [ 0.        ,  0.        ,  0.        , ..., 19.        ,
         1.        , 39.        ],
       [ 0.        ,  0.        ,  0.        , ..., 18.        ,
         5.        , 42.        ]])

In [38]:
new_x.shape

(88625, 103)

# Splitting Data

In [39]:
len(df)

88625

In [40]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size = 0.3)

In [41]:
len(train_df), len(test_df)

(62037, 26588)

In [42]:
X = new_x
X.shape

(88625, 103)

In [43]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
y = encoder.fit_transform(df[['author']]).toarray()
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)

In [45]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((70900, 103), (17725, 103), (70900, 11), (17725, 11))