# RNN

In [0]:
import re

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# from google.colab import drive
# drive.mount('/content/gdrive')

# import os
# os.chdir('gdrive/My Drive/Colab Notebooks')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Location of the MBTI CSV; the describe() output below shows two columns,
# 'type' and 'posts'.
url = 'https://raw.githubusercontent.com/k0nci/oznal/master/data/mbti_1.csv'

# pandas reads the CSV straight from the URL, so this cell needs network access.
raw_data = pd.read_csv(url)
# Last expression of the cell: rendered as the cell output.
raw_data.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,Captain phillips|||:happy: nice|||I'm currentl...
freq,1832,1


In [0]:
def _split_posts(joined_posts):
    """Split one '|||'-delimited string into a Series of whitespace-stripped posts."""
    return pd.Series([piece.strip() for piece in joined_posts.split('|||')])


# Reshape from one row per user (all posts in a single string) to one row per
# (user_id, post_id) pair:
#   1. expand every posts string into one column per post,
#   2. expose the original row index as 'user_id' and re-attach the 'type' label,
#   3. melt the per-post columns into long form and drop the padding NaNs that
#      users with fewer posts receive.
data = (
    raw_data['posts']
    .apply(_split_posts)
    .reset_index()
    .rename(columns={'index': 'user_id'})
    .merge(raw_data, left_index=True, right_index=True)
    .drop(columns=['posts'])
    .melt(id_vars=['user_id', 'type'], value_name='post', var_name='post_id')
    .dropna(subset=['post'])
    .sort_values(by=['user_id', 'post_id'])
    .reset_index(drop=True)
)

data.describe()

Unnamed: 0,user_id
count,422845.0
mean,4340.423529
std,2506.825488
min,0.0
25%,2165.0
50%,4345.0
75%,6515.0
max,8674.0


In [0]:
# Each entry pairs a position in the four-letter type code with the two letters
# that can appear there. One 0/1 indicator column is created per letter, in the
# order I, E, N, S, T, F, J, P.
mbti_axes = [(0, 'IE'), (1, 'NS'), (2, 'TF'), (3, 'JP')]

for position, letters in mbti_axes:
    for letter in letters:
        # Default arguments bind the current loop values into the lambda.
        data[letter] = data['type'].apply(
            lambda code, position=position, letter=letter: 1 if code[position] == letter else 0
        )

data.head()

Unnamed: 0,user_id,type,post_id,post,I,E,N,S,T,F,J,P
0,0,INFJ,0,'http://www.youtube.com/watch?v=qsXHcwe3krw,1,0,1,0,0,1,1,0
1,0,INFJ,1,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,1,0,1,0,0,1,1,0
2,0,INFJ,2,enfp and intj moments https://www.youtube.com...,1,0,1,0,0,1,1,0
3,0,INFJ,3,What has been the most life-changing experienc...,1,0,1,0,0,1,1,0
4,0,INFJ,4,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,1,0,1,0,0,1,1,0


In [0]:
# Per-post surface features, computed on the raw text before any cleaning.

# The pattern must be a raw string: '\w' and '\d' in a normal string literal are
# invalid escape sequences (a warning today, an error in future Python versions).
# Compiled once instead of being re-looked-up for every post.
URL_PATTERN = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')

# The pattern has only non-capturing groups, so findall() returns one entry per URL.
data['urls_count'] = data['post'].apply(lambda x: len(URL_PATTERN.findall(x)))

data['words_count'] = data['post'].apply(lambda x: len(x.split()))
data['sentences_count'] = data['post'].apply(lambda x: len(sent_tokenize(x)))
# NOTE(review): a post for which sent_tokenize yields no sentences gives a zero
# denominator here, so the ratio is NaN/inf for such rows - confirm that is acceptable.
data['words_per_sentence'] = data['words_count'] / data['sentences_count']

In [0]:
# Ordered (pattern, replacement) steps applied to every lower-cased post.
# Compiled once so the whole clean-up runs in a single pass over the column.
_POST_CLEANUP_STEPS = [
    # collapse runs of spaces
    (re.compile(r" +"), " "),
    # drop URLs
    (re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"), ""),
    # drop MBTI type codes (including masked forms such as 'xnfp' or 'i**j') so the
    # label cannot leak into the text
    (re.compile(r"[iex\*][nsx\*][ftx\*][pjx\*]"), ""),
    # drop digits
    (re.compile(r"[0-9]+"), ""),
    # drop hashtags
    (re.compile(r"#[a-zA-Z]+"), ""),
    # drop punctuation. The original code had an extra step written as a JavaScript
    # regex literal ("/[...]/g"), which in Python only matches the literal text
    # "/<char>/g" and therefore removed nothing. The metacharacters it listed that
    # no other step handles (^ $ * | { }) are folded into this class instead; it
    # runs after the MBTI step, which still needs to see '*'.
    (re.compile(r"['\";:,.?!\/\\()\[\]+^$*|{}]"), ""),
    # hyphens and underscores separate words
    (re.compile(r"[-_]"), " "),
    # collapse the spaces introduced by the removals above
    (re.compile(r" +"), " "),
]


def clean_post(post):
    """Return ``post`` lower-cased with URLs, type codes, digits, hashtags and punctuation removed.

    The result is stripped at the end, so a post that consisted only of removed
    content comes back as the empty string rather than a lone space.
    """
    cleaned = post.lower().strip()
    for pattern, replacement in _POST_CLEANUP_STEPS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()


data['post'] = data['post'].apply(clean_post)

# Drop posts that are empty after cleaning (e.g. posts that were only a link).
data = data[data['post'] != '']

In [0]:
def merge_post(series, sep=' '):
    """Concatenate all strings of ``series`` into one string.

    Args:
        series: pandas Series of strings.
        sep: separator placed between consecutive elements (default: one space).

    Returns:
        The single joined string produced by ``Series.str.cat``.
    """
    joined = series.str.cat(sep=sep)
    return joined

# Collapse the per-post rows back to one row per user: the label, all cleaned
# posts joined into one document, and simple length statistics.
aggregations = {
    'I': 'max',
    'post': merge_post,
    'words_count': ['mean', 'var'],
    'sentences_count': 'mean',
}
user_data = data.groupby('user_id').agg(aggregations)

# The aggregation produces two-level column labels (source column, aggregation
# name); flatten each pair into a single 'column_aggregation' string.
user_data.columns = ["_".join(levels) for levels in user_data.columns]

# Friendlier names; 'words_count_var' keeps its flattened name.
user_data = user_data.rename(
    columns={
        'I_max': 'I',
        'post_merge_post': 'posts',
        'words_count_mean': 'words_per_post',
        'sentences_count_mean': 'sentences_per_post',
    }
)


user_data.head()

Unnamed: 0_level_0,I,posts,words_per_post,words_count_var,sentences_per_post
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,and moments sportscenter not top ten plays pr...,16.416667,124.821429,1.638889
1,0,im finding the lack of me in these posts very ...,25.87234,165.852914,2.723404
2,1,good one of course to which i say i know that...,20.880952,155.570848,1.97619
3,1,dear i enjoyed our conversation the other day ...,22.26,185.543265,2.3
4,0,youre fired thats another silly misconception ...,21.553191,187.643848,2.382979


In [0]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# Seed shared by the split and the upsampling so a Restart & Run All reproduces
# the same train/test partition (the split was previously unseeded while the
# resampling was seeded, so results changed from run to run).
SPLIT_SEED = 123

y = user_data['I']
# The label column stays in X for now: it is needed to separate the classes
# for upsampling and is dropped at the end of this cell.
X = user_data

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Upsampling happens after the split, on the training portion only, so no
# duplicated row can end up in the test set.
# Label 1 is treated as the majority class; the recorded classification report
# (support 1322 vs 413) is consistent with that for this dataset.
df_majority = X_train[X_train['I'] == 1]
df_minority = X_train[X_train['I'] == 0]

# Sample the minority class with replacement until it matches the majority size.
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=SPLIT_SEED)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])
X_train = df_upsampled

# Re-derive the training labels from the balanced frame, then remove the label
# column from both feature frames.
y_train = X_train['I']

X_train = X_train.drop(columns='I')
X_test = X_test.drop(columns='I')



In [0]:
# Preview the balanced training features; a bounded head() keeps the rendered
# output small instead of displaying the whole frame.
X_train.head(10)

Unnamed: 0_level_0,posts,words_per_post,words_count_var,sentences_per_post
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5138,you know what you can do show him a more accur...,28.420000,146.166939,3.000000
6148,youve gotta be starving youve gotta be starvin...,14.578947,141.331437,1.973684
5343,god dangit now the song listen to your heart i...,22.200000,171.102041,2.020000
3530,nomadleviathan yeah it can take this form as w...,31.652174,61.654106,3.021739
1932,well he didnt appear toxic when i met him a bi...,31.638889,93.837302,2.916667
4055,i ticked volunteering as one of my answers so ...,28.437500,149.953457,2.291667
4927,im way too forgiving i am not a door slammer i...,33.240000,97.328980,2.900000
6811,also ive had this bizarre in depth fascination...,33.857143,56.541667,2.653061
1091,coulda been better hello and welcome from a fe...,24.180000,188.640408,3.080000
1753,they are usually the kind of girls who you can...,22.687500,220.942819,2.833333


In [0]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Vocabulary cap handed to the Keras Tokenizer as ``num_words``.
TOP_K = 10000

# Upper bound on the padded sequence length passed to pad_sequences.
MAX_SEQUENCE_LENGTH = 5000

def sequence_vectorize(train_texts, val_texts):
    """Turn texts into fixed-length integer sequences.

    The tokenizer vocabulary is fitted on ``train_texts`` only and then applied
    to both inputs.

    Args:
        train_texts: iterable of training documents (strings).
        val_texts: iterable of validation documents (strings).

    Returns:
        A 6-tuple ``(x_train, x_val, word_index, max_length, TOP_K, kept_words)``:
        the two padded sequence arrays, the tokenizer's full word -> index
        mapping, the sequence length used for padding, the vocabulary cap, and
        the list of words that can actually occur in the sequences.
    """
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Longest training sequence, capped at MAX_SEQUENCE_LENGTH.
    max_length = min(max(len(seq) for seq in x_train), MAX_SEQUENCE_LENGTH)

    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    # With num_words=TOP_K the tokenizer only emits indices strictly below
    # TOP_K, so the kept-word list uses '<'. The previous '<=' also listed the
    # word at index TOP_K, which never appears in any sequence.
    kept_words = [word for word, index in tokenizer.word_index.items() if index < TOP_K]

    return x_train, x_val, tokenizer.word_index, max_length, TOP_K, kept_words


 

In [0]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the WordNet corpus. nltk is already imported in the first cell, which
# is why this call works before the re-import below.
nltk.download('wordnet')

import nltk
from nltk.corpus import stopwords
# English stop words as a set, used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

    
def remove_stop_words_and_lemmatize(data):
    """Drop stop words from each document and lemmatize the remaining tokens.

    Args:
        data: iterable of documents (strings).

    Returns:
        A list with one string per input document: the lemmas of all tokens not
        found in the module-level ``stop_words`` set, joined by single spaces.
        A document with no remaining tokens yields the empty string.

    Note:
        The previous version appended the Porter stem of the lemma AND the lemma
        itself with no separator, so every kept word became one glued
        "<stem><lemma>" token. Only the lemma is emitted now, matching the
        function name. Each glued token still corresponded to one lemma, so the
        resulting vocabulary should be equivalent, just made of real words.
    """
    lemmatizer = WordNetLemmatizer()

    res = []
    for document in data:
        lemmas = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(document)
            if word not in stop_words
        ]
        res.append(" ".join(lemmas))
    return res

# Text preprocessing: stop-word removal and lemmatization of the merged posts
# for the training and held-out users.
x_train = remove_stop_words_and_lemmatize(X_train.posts.values)
x_val = remove_stop_words_and_lemmatize(X_test.posts.values)

# Integer-encode and pad both sets with a tokenizer fitted on the training texts.
(x_train, x_val, word_index, max_length, vocab_size, all_words) = sequence_vectorize(
    x_train, x_val
)

# Size of the embedding input: whichever is smaller of the vocabulary cap and
# the number of distinct words seen plus one.
num_features = min(TOP_K, len(word_index) + 1)

Using TensorFlow backend.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
def RNN():
    """Build the (uncompiled) binary text classifier.

    Stack: embedding -> 1D convolution with max pooling -> LSTM -> single
    sigmoid unit, with dropout of 0.5 after the embedding, pooling and LSTM
    stages. Reads the module-level ``num_features`` and ``max_length``.

    Returns:
        A Keras ``Sequential`` model.
    """
    layers = [
        Embedding(num_features, output_dim=64, input_length=max_length),
        Dropout(0.5),
        Conv1D(filters=16, kernel_size=3, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(0.5),
        LSTM(16),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

model = RNN()

# Single sigmoid output, hence binary cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The held-out split doubles as validation data here, so the per-epoch
# validation metrics are computed on the same users as the final evaluation.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_test),
    epochs=3,
    batch_size=64,
)

Train on 10708 samples, validate on 1735 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f2b902584a8>

In [0]:
# Sigmoid outputs for the held-out users, thresholded at 0.5 into 0/1 labels.
predicted_probabilities = model.predict(x_val)
y_pred = [1 if probability > 0.5 else 0 for probability in predicted_probabilities]

In [0]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out users; printed because the
# report is a pre-formatted text table.
print(classification_report(y_test, y_pred))

# Last expression of the cell: the confusion matrix is shown as the cell output
# (rows are true labels, columns are predicted labels, per scikit-learn's convention).
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.34      0.42      0.38       413
           1       0.81      0.75      0.78      1322

   micro avg       0.67      0.67      0.67      1735
   macro avg       0.58      0.59      0.58      1735
weighted avg       0.70      0.67      0.68      1735



array([[175, 238],
       [333, 989]])

In [0]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores, so both
# classes count equally regardless of their support.
f1_score(y_test, y_pred, average='macro')

In [0]:
# Share of label-1 rows across the training and test labels combined.
# NOTE(review): y_train is taken from the upsampled training frame, so this is
# not the class share of the original dataset - confirm which one is intended.
(y_train.sum() + y_test.sum()) / (len(y_train) + len(y_test))