In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, roc_auc_score, roc_curve, auc, f1_score, recall_score, precision_score, accuracy_score
from sklearn.pipeline import Pipeline

#Classifiers
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

#reproducibility
import os

rs = {'random_state': 10}

from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')

random_seed = 10

np.random.seed(random_seed)

import gensim
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense, Conv2D, BatchNormalization, MaxPooling2D, Flatten

from gensim.models import Word2Vec

import json

tf.random.set_seed(10)

In [3]:
data = pd.read_csv('tweets.csv', index_col=0)
data.head()

Unnamed: 0,text,target,num_hashtags,hashtags,num_mentions,mentions,emojis,num_chars,num_words,mean_word_length,num_proper_nouns,num_nouns,tokens,lemmas
0,deeds earthquake allah forgive,1,1,['#earthquake'],0,[],0,68,13,4.307692,0,2,"['deeds', 'earthquake', 'allah', 'forgive']","['deed', 'earthquake', 'allah', 'forgive']"
1,near ronge sask canada,1,0,[],0,[],0,37,7,4.428571,0,3,"['near', 'ronge', 'sask', 'canada']","['near', 'ronge', 'sask', 'canada']"
2,residents asked shelter notified officers shel...,1,0,[],0,[],0,130,22,4.954545,0,5,"['residents', 'asked', 'shelter', 'notified', ...","['resident', 'asked', 'shelter', 'notified', '..."
3,receive wildfires orders california,1,1,['#wildfires'],0,[],0,56,7,7.142857,0,2,"['receive', 'wildfires', 'orders', 'california']","['receive', 'wildfire', 'order', 'california']"
4,sent ruby alaska wildfires pours,1,2,"['#Alaska', '#wildfires']",0,[],0,85,16,4.375,0,3,"['sent', 'ruby', 'alaska', 'wildfires', 'pours']","['sent', 'ruby', 'alaska', 'wildfire', 'pours']"


In [4]:
data = data.drop(['num_hashtags', 'hashtags', 'num_mentions',
       'mentions', 'emojis', 'num_chars', 'num_words', 'mean_word_length',
       'num_proper_nouns', 'num_nouns', 'tokens'], axis = 1)

In [9]:
data.head()

Unnamed: 0,text,target,lemmas
0,deeds earthquake allah forgive,1,"['deed', 'earthquake', 'allah', 'forgive']"
1,near ronge sask canada,1,"['near', 'ronge', 'sask', 'canada']"
2,residents asked shelter notified officers shel...,1,"['resident', 'asked', 'shelter', 'notified', '..."
3,receive wildfires orders california,1,"['receive', 'wildfire', 'order', 'california']"
4,sent ruby alaska wildfires pours,1,"['sent', 'ruby', 'alaska', 'wildfire', 'pours']"


In [18]:
X = data['lemmas'].values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 10)

In [19]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train.ravel())
y_test = label_encoder.fit_transform(y_test.ravel())

First, we need to decide on the number of features for our word2vector model. We will try between 100 to 300. This will form the vector size for our word to vector models. 

In [20]:
number_of_features = 100
model = Word2Vec(X_train,
                 vector_size = number_of_features,
                 window = 150,
                 min_count = 10, 
                 workers=4, 
                 epochs=5
                )

Now, we want to generate averaged word vecto features from our word2vector model.

In [21]:
vocabulary = set(model.wv.index_to_key)
feature_vector = np.zeros((number_features,), dtypes='float64')
num_words = 0
for word in words:
    if word in vocabulary:
        num_words = num_words + 1
        feature_vector = np.divide(feature_words, num_words)
features = [average]

{' ',
 "'",
 ',',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [28]:

import pandas as pd
import numpy as np

pd.options.display.max_colwidth = 200

corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beautiful today,weather
7,The dog is lazy but the brown fox is quick!,animals


In [30]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\ProgramData\Anaconda3\lib\nltk_data...


PermissionError: [WinError 5] Access is denied: 'C:\\ProgramData\\Anaconda3\\lib\\nltk_data\\corpora\\stopwords.zip'