# Preprocessing and Modeling

Here we build models that classify whether a post comes from the TheSilphRoad subreddit or the pokemongo subreddit.

In [2]:
# import our libraries

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from collections import Counter
from googletrans import Translator, LANGUAGES
from time import sleep

In [3]:
# read the merged posts CSV into a dataframe and preview its first rows

merged_path = '../data/merged.csv'
merged_df = pd.read_csv(merged_path)
merged_df.head(n = 5)

Unnamed: 0,subreddit,merged_text
0,TheSilphRoad,Fix to not being able to attack? Has anybody f...
1,TheSilphRoad,Attack glitch during Regi raids 2 raids today ...
2,TheSilphRoad,[Bug?] Can’t seem to earn or collect pokecoins...
3,TheSilphRoad,"[Bug?] AR suddenly freezes Using an iPhone 11,..."
4,TheSilphRoad,3 hour incense event personal results For any ...


In [4]:
# set TheSilphRoad to 1 and pokemongo to 0
# The mask also accepts the already-encoded value 1, so re-running this cell
# keeps the labels instead of comparing 0/1 against the string and turning
# every row into 0.
is_silph = (merged_df['subreddit'] == 'TheSilphRoad') | (merged_df['subreddit'] == 1)
merged_df['subreddit'] = np.where(is_silph, 1, 0)
merged_df.head()

Unnamed: 0,subreddit,merged_text
0,1,Fix to not being able to attack? Has anybody f...
1,1,Attack glitch during Regi raids 2 raids today ...
2,1,[Bug?] Can’t seem to earn or collect pokecoins...
3,1,"[Bug?] AR suddenly freezes Using an iPhone 11,..."
4,1,3 hour incense event personal results For any ...


In [5]:
# Drop rows with missing values and renumber the index to 0..n-1.
# Later cells address rows through integer labels (range(len(texts))) and join
# this frame to a freshly built RangeIndex frame, so the labels must not have
# gaps left behind by the dropped rows.
merged_df = merged_df.dropna().reset_index(drop = True)

# confirm that no missing values remain
merged_df.isnull().sum()

subreddit      0
merged_text    0
dtype: int64

In [6]:
# show the (rows, columns) size of the dataframe after dropping missing values
merged_df.shape

(19344, 2)

In [7]:
# create a custom stop words list to remove all similar words we found in the previous notebook

# words that would easily identify a silph post, plus words that are common across both subreddits
# (no separate list is made for pokemongo because both subreddits are related to pokemongo)
custom_words = ['silph', 'road', 'silphroad', 'pokemon', 'go', 'get', 'one', 'like', 'would', 'know', 'time', 'game']

# base english stopwords followed by the custom additions
new_stopwords = stopwords.words('english') + custom_words

In [8]:
# count-vectorize the raw post text, ignoring the custom stop words and stripping accents
cvec = CountVectorizer(strip_accents = 'unicode', stop_words = new_stopwords)

# sparse document-term matrix for every post
X = cvec.fit_transform(merged_df['merged_text'])

# dense dataframe with one column per vocabulary token
token_names = cvec.get_feature_names()
text_df = pd.DataFrame(X.todense(), columns = token_names)
text_df.head()

Unnamed: 0,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,000exp,...,хрусть,что,шикарно,это,から毎週金曜よる6時55分にお引っ越しすることを記念し,そのうち10月6日,シャトウ,テレヒ東京系にて放送中のアニメ,ホケットモンスター,特別な内容て開催されます
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The vocabulary above contains tokens from other languages (for example Russian and Japanese), for which CountVectorizer doesn't return the expected result. This needs to be handled before we can move on to modeling. To take care of it, we will use the `googletrans` Google Translate library to translate any posts that aren't in English into English.

In [9]:
# thanks to this article for the following code: https://medium.com/analytics-vidhya/popular-python-libraries-in-nlp-dealing-with-language-detection-translation-beyond-7b8e7cb2928e

# initialize Translator
trans = Translator()
texts = merged_df['merged_text'].copy()

# Translate every post, addressing rows by their real index label.
# Using range(len(texts)) as labels would raise KeyError for any label removed
# by the earlier dropna and would never reach labels >= len(texts).
# Translation stays best effort: a failing post keeps its original text, but
# the failure is recorded instead of being silently swallowed, and only
# Exception is caught so the loop can still be interrupted from the keyboard.
failed_labels = []
for label in texts.index:
    try:
        texts.at[label] = trans.translate(texts.at[label]).text
    except Exception:
        failed_labels.append(label)

print(len(failed_labels), 'of', len(texts), 'posts could not be translated and were left unchanged')

# countVectorize
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
cvec = CountVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')

X = texts

X = cvec.fit_transform(X)
text_df = pd.DataFrame(X.todense(), columns = cvec.get_feature_names())
text_df.head()

Unnamed: 0,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,000exp,...,так,теперь,то,трубы,удар,хрусть,что,шикарно,это,シャトウ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# show the rows whose count for the token 'ðætte' is exactly 1
text_df[text_df['ðætte'] == 1]

Unnamed: 0,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,000exp,...,zyonik,zyvq62h,zzmldqoo2e,zzsa8wa,zzy8nnk,zzyaatk,ðe,ðeah,ðone,ðætte
15186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1


In [18]:
# show the index label stored at position 1201 of `texts`
# NOTE(review): this cell's execution count is out of order relative to its
# neighbours -- presumably a leftover check; confirm it is still needed.
texts.index[1201]

1201

In [24]:
# display the full text of the post labelled 15186
texts.loc[15186]

'Is this a bug? https://i.imgur.com/e3sGsy9.png\n\n[Attraction with pron ðe ic yfel [got] mîn [chamberfellow] wiðtêon hât, yfel [noticed] ðone as [there] [were] siex [hearts] [self-revelatory] onunder [bataan-corregidor] ðêah yfel [was] [wonder-struck\n] sam ðætte [is] hlêg [radiomicrophone]by [supposititious] [succedaneum] bêga [feature]]()\n\nHas anyone seen this before?'

In [26]:
# as the translation library only translates when the majority of text is another language, we need to do a few manually

# post label -> source language code to translate from
manual_sources = {902: 'ru', 1078: 'ru', 1198: 'ru', 1201: 'ru', 6249: 'ja'}
for label, source_lang in manual_sources.items():
    texts.loc[label] = trans.translate(texts[label], src = source_lang, dest = 'en').text

# these two posts are overwritten with fixed English text
texts.loc[8311] = 'Rank 9 player with &lt;2500 points Hi pvp guys, I think my GBL It\'s buggy (Bugs in GBL is kinda new right?@_@)\nToday i hit rank9 with 55% winrate, is practically the same as many others with a score of 2550+, but i only got 2200points which is too low for a rank9.\nAt first I thought it would be something related to the "hidden ELO" so I continued to battle and managed to get a 1-4 and i didn\'t lose points\nSo I was wondering if this happened to anyone else and if there is anyone who can help me, maybe @NianticHelp'
texts.loc[15186] = 'Is this a bug? https://i.imgur.com/e3sGsy9.png\n\n[Attraction with as [there] [were] six [hearts] Has anyone seen this before?'

# countVectorize the corrected text
cvec = CountVectorizer(strip_accents = 'unicode', stop_words = new_stopwords)
X = cvec.fit_transform(texts)

token_names = cvec.get_feature_names()
text_df = pd.DataFrame(X.todense(), columns = token_names)
text_df.head()

Unnamed: 0,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,000exp,...,zygarde_cell,zygarde_complete,zygarde_core,zygrade,zyonik,zyvq62h,zzmldqoo2e,zzsa8wa,zzy8nnk,zzyaatk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# invert LANGUAGES (code -> name) into a name -> code lookup, used to find the
# source language code to pass to the translator
LANGCODES = {name: code for code, name in LANGUAGES.items()}
print(LANGCODES)

{'afrikaans': 'af', 'albanian': 'sq', 'amharic': 'am', 'arabic': 'ar', 'armenian': 'hy', 'azerbaijani': 'az', 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bosnian': 'bs', 'bulgarian': 'bg', 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese (simplified)': 'zh-cn', 'chinese (traditional)': 'zh-tw', 'corsican': 'co', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dutch': 'nl', 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'filipino': 'tl', 'finnish': 'fi', 'french': 'fr', 'frisian': 'fy', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el', 'gujarati': 'gu', 'haitian creole': 'ht', 'hausa': 'ha', 'hawaiian': 'haw', 'hebrew': 'he', 'hindi': 'hi', 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', 'igbo': 'ig', 'indonesian': 'id', 'irish': 'ga', 'italian': 'it', 'japanese': 'ja', 'javanese': 'jw', 'kannada': 'kn', 'kazakh': 'kk', 'khmer': 'km', 'korean': 'ko', 'kurdish (kurmanji)': 'ku', 'kyrgyz': 'ky', 'lao': 'lo', 'latin': 'la', 'latvian': 

Now that we have resolved the issues with other languages, we can finally move on to creating a model that differentiates posts from the TheSilphRoad and pokemongo subreddits.

In [28]:
# combine countvectorized dataframe with dataframe containing our subreddits

# pd.concat(axis = 1) aligns rows by index label. text_df is built with a fresh
# 0..n-1 index, so the labels are renumbered the same way to pair rows by
# position; otherwise any gap in merged_df's index would shift labels against
# features and introduce NaN rows.
subreddit_labels = merged_df['subreddit'].reset_index(drop = True)
assert len(subreddit_labels) == len(text_df), 'label and feature row counts differ'

# NOTE(review): if the vocabulary itself contains the token 'subreddit', the
# result has two columns with that name -- confirm before selecting the target.
singlegram_df = pd.concat([subreddit_labels, text_df], axis = 1)
singlegram_df.head()

Unnamed: 0,subreddit,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,...,zygarde_cell,zygarde_complete,zygarde_core,zygrade,zyonik,zyvq62h,zzmldqoo2e,zzsa8wa,zzy8nnk,zzyaatk
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
