# Tweets Classification

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
# Lift the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

In [2]:
# Load the tweet dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('AllTweets.csv')

In [3]:
# Preview the first rows to see the available columns.
df.head()

Unnamed: 0,date,id,link,retweet,text,author
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA


In [4]:
# Row count before any filtering.
len(df)

88625

Let's remove the retweets and tweets by "various"

In [5]:
# Keep only the rows whose `retweet` flag is False (i.e. drop retweets).
not_retweet = df['retweet'].eq(False)
df = df.loc[not_retweet]

In [6]:
# Row count after removing retweets.
len(df)

84977

In [7]:
# List the distinct author labels present in the data.
df['author'].unique()

array(['NASA', 'AdamSavage', 'various', 'BarackObama', 'DonaldTrump',
       'FiveThirtyEight', 'HillaryClinton', 'KimKardashian',
       'deGrasseTyson', 'ScottKelly', 'RichardDawkins'], dtype=object)

In [8]:
# Drop the rows attributed to the "various" author label.
# .copy() gives an independent frame: later cells add columns to `df`, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
df = df[df['author'] != 'various'].copy()
len(df)

74543

#### Preprocess Text

In [9]:
import nltk

In [10]:
# Fetch the NLTK resources used below: the stopword corpus (read via
# nltk.corpus.stopwords) and "punkt" (presumably required by word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ghais\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
stemmer = nltk.stem.porter.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
# Set view of the stopword list so membership tests do not scan the whole list.
_stopword_set = frozenset(stopwords)


def clean_text(t):
    """Normalise a tweet into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized with NLTK; tokens that are not
    purely alphabetic (``str.isalpha``) or that are English stopwords are
    discarded; the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives the filters).
    """
    tokens = nltk.tokenize.word_tokenize(t.lower())
    kept = [w for w in tokens if w.isalpha() and w not in _stopword_set]
    return ' '.join(stemmer.stem(w) for w in kept)

#### Add new column for cleaned text

In [12]:
# Store the normalised text next to the raw tweet, then preview the result.
df['cleaned_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...


## Prepare training Data

### Vectorize text

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer capped at 2000 features (max_features=2000).
tfidf = TfidfVectorizer(max_features=2000)

In [14]:
# Fit the vectorizer on the cleaned tweets.
# NOTE(review): this is fitted on the full dataset, before the train/test split
# performed later in the notebook.
tfidf.fit(df['cleaned_text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=2000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [15]:
# Encode every cleaned tweet as a row of TF-IDF weights (sparse matrix).
X = tfidf.transform(df['cleaned_text'])

In [16]:
# Show the matrix summary (shape, dtype, number of stored elements).
X

<74543x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 517958 stored elements in Compressed Sparse Row format>

### Additional Features, if you want!

#### Number of words in each text

In [17]:
# Number of pieces obtained by splitting the raw tweet on single spaces.
df['words_count'] = [len(tweet.split(' ')) for tweet in df['text']]

In [18]:
# Preview the frame with the new `words_count` column.
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text,words_count
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...,17
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...,20
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...,20
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...,18
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...,17


#### Number of punctuation symbols in original text

In [19]:
# Count the characters of the raw tweet that are one of ; : , . ! ?
df['punct_count'] = df['text'].map(
    lambda tweet: sum(ch in (';', ':', ',', '.', '!', '?') for ch in tweet)
)

In [20]:
# Preview the frame with the new `punct_count` column.
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text,words_count,punct_count
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...,17,6
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...,20,8
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...,20,9
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...,18,7
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...,17,9


In [21]:
# NOTE(review): `for x in t` iterates over the *characters* of the tweet, not its
# words, so this counts characters that equal an entry of `stopwords` (only
# single-letter entries can match). The displayed values exceed `words_count`,
# which is consistent with that. predict_text() computes the feature the same
# way and the model is trained on it, so any change to a word-level count must
# be made in both places and the model retrained.
df['stopwords_count'] = df['text'].map(lambda t: len([x for x in t if x in stopwords]))

In [22]:
# Preview the frame with the new `stopwords_count` column.
df.head()

Unnamed: 0,date,id,link,retweet,text,author,cleaned_text,words_count,punct_count,stopwords_count
0,18h18 hours ago,7.87029e+17,/NASA/status/787029214612586496,False,New software on the @Space_Station will make d...,NASA,new softwar make data commun faster easier hun...,17,6,67
1,19h19 hours ago,7.87009e+17,/NASA/status/787008587927281664,False,READY for Launch! @OrbitalATK's #Antares rocke...,NASA,readi launch orbitalatk antar rocket cargo shi...,20,8,51
2,17h17 hours ago,7.87042e+17,/NASA/status/787042156116664320,False,Rocket launch at 8:03pm ET Sunday to deliver c...,NASA,rocket launch et sunday deliv cargo launch vie...,20,9,47
3,47m47 minutes ago,7.87284e+17,/NASA/status/787284414732472320,False,What happened this week at NASA? @POTUS outlin...,NASA,happen week nasa potu outlin space explor futu...,18,7,44
4,24h24 hours ago,7.86934e+17,/NASA/status/786934496625102848,False,Energize! Study makes sure astronauts have ene...,NASA,energ studi make sure astronaut energi need wa...,17,9,54


#### Merge new features with original ones

In [23]:
# Reminder of the TF-IDF matrix shape before the extra columns are appended.
X

<74543x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 517958 stored elements in Compressed Sparse Row format>

In [24]:
# Pull the three hand-crafted count columns out as a plain NumPy array.
extra_columns = ['words_count', 'punct_count', 'stopwords_count']
new_features = df.loc[:, extra_columns].to_numpy()
new_features

array([[17,  6, 67],
       [20,  8, 51],
       [20,  9, 47],
       ...,
       [11,  3, 25],
       [19,  1, 39],
       [18,  5, 42]], dtype=int64)

In [25]:
# Densify the TF-IDF matrix and append the count features as extra columns.
tfidf_dense = X.toarray()
new_x = np.hstack((tfidf_dense, new_features))

In [26]:
# Inspect the combined feature matrix (count features are the last three columns).
new_x

array([[ 0.,  0.,  0., ..., 17.,  6., 67.],
       [ 0.,  0.,  0., ..., 20.,  8., 51.],
       [ 0.,  0.,  0., ..., 20.,  9., 47.],
       ...,
       [ 0.,  0.,  0., ..., 11.,  3., 25.],
       [ 0.,  0.,  0., ..., 19.,  1., 39.],
       [ 0.,  0.,  0., ..., 18.,  5., 42.]])

In [27]:
# Confirm the combined matrix has the TF-IDF columns plus the three count columns.
new_x.shape

(74543, 2003)

# Splitting Data

In [28]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the author label; the fitted encoder is kept so predictions
# can be mapped back to author names via `encoder.categories_`.
author_column = df[['author']]
encoder = OneHotEncoder().fit(author_column)
y = encoder.transform(author_column).toarray()
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [29]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    new_x, y, train_size=0.8, random_state=42
)

In [30]:
# Sanity-check the shapes of the train/test features and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((59634, 2003), (14909, 2003), (59634, 10), (14909, 10))

# Build Neural Network

In [31]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

In [32]:
# Feed-forward classifier: features -> 1000 -> 100 -> softmax over authors.
# Layer sizes at the edges are derived from the data instead of being hard-coded.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
# `shape` is given as a tuple (one entry per non-batch dimension).
model.add(Input(shape=(n_features,)))
# Hidden layers need a non-linear activation: Dense layers without one are
# purely linear, and a stack of linear layers is equivalent to a single one.
model.add(Dense(1000, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

In [33]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              2004000   
_________________________________________________________________
dense_1 (Dense)              (None, 100)               100100    
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
Total params: 2,105,110
Trainable params: 2,105,110
Non-trainable params: 0
_________________________________________________________________


In [34]:
# No optimizer was chosen explicitly; 'rmsprop' is the Keras default for
# compile(), written out here so the training setup is visible.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [35]:
# Train for 15 epochs in batches of 1000 rows; the held-out test split is
# passed as validation data, so it is scored after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=1000,
    epochs=15,
    validation_data=(X_test, y_test),
)

Train on 59634 samples, validate on 14909 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1ff554138c8>

In [36]:
def predict_text(text=None):
    """Predict the author label for a single tweet.

    Parameters
    ----------
    text : str, optional
        Raw tweet text. When omitted, the user is prompted for it with
        ``input`` (the original interactive behaviour).

    Returns
    -------
    The entry of ``encoder.categories_[0]`` whose predicted probability is
    highest.
    """
    if text is None:
        text = input("Enter tweet: ")

    # The feature vector must be built exactly like the training matrix:
    # TF-IDF of the cleaned text followed by the three count features.
    text_features = tfidf.transform([clean_text(text)])
    words_count = len(text.split(' '))
    punct_count = len([x for x in text if x in (';', ':', ',', '.', '!', '?')])
    # NOTE(review): iterates over characters, not words. Kept as-is because
    # the training column `stopwords_count` is computed the same way; change
    # both together (and retrain) if a word-level count is intended.
    stopwords_count = len([x for x in text if x in stopwords])
    new_features = np.concatenate(
        (text_features.toarray(), [[words_count, punct_count, stopwords_count]]),
        axis=1,
    )

    prediction = model.predict(new_features)
    # One input row -> one row of class probabilities.
    predicted_index = np.argmax(prediction[0])
    return encoder.categories_[0][predicted_index]

In [37]:
# Interactive demo: prompts for a tweet and shows the predicted author.
predict_text()

Enter tweet: The Invisible Enemy will soon be in full retreat!


'DonaldTrump'