In [26]:
import pandas as pd, numpy as np, json, re, pickle
import string
from collections import Counter
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction import DictVectorizer
from nltk import pos_tag
from nltk import word_tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import classification_report

In [2]:
def read_data(file):
    """
    Take a json file location and
    read the file into a pandas data frame
    Args: full path to file
    Returns: pandas dataframe with data from file
    """
    
    data = []

    with open(file) as f:
        for line in f:
            data.append(json.loads(line))
        
    # convert to data frame
    
    return pd.DataFrame(data)

In [3]:
df = read_data('categorized-comments.jsonl')

# check size, structure and categories

print('Size: ', len(df), '\n',
      'Shape: ', df.info(), '\n',
      'Categories: ', df.cat.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606474 entries, 0 to 606473
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   cat     606474 non-null  object
 1   txt     606474 non-null  object
dtypes: object(2)
memory usage: 9.3+ MB
Size:  606474 
 Shape:  None 
 Categories:  ['sports' 'science_and_technology' 'video_games']


In [4]:
df = df[:50000]

In [5]:
df.apply(lambda x: x.astype(str).str.lower())

Unnamed: 0,cat,txt
0,sports,barely better than gabbert? he was significant...
1,sports,should have drafted more wrs.\n\n- matt millen...
2,sports,[done](https://i.imgur.com/2yz90pm.jpg)
3,sports,no!! noo!!!!!
4,sports,ding dong the kaepers gone!!!!!! yes!!!! frida...
...,...,...
49995,sports,never does.
49996,sports,i think tosh is our best one left. napier is g...
49997,sports,"close, and we could really use a tight end..."
49998,sports,yeah but that didn't really end all that well ...


In [6]:
#remove punctuation
df['txt'] = df['txt'].apply(lambda x:''.join([i for i in x 
                                                  if i not in string.punctuation]))

In [7]:
stop = stopwords.words('english')

In [8]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
df = df.apply(lambda x: x.astype(str).str.lower())

In [10]:
df['txt'] = df['txt'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))

In [11]:
print(df['txt'])

0        barely better gabbert significantly better yea...
1                         drafted wrs matt millen probably
2                             donehttpsiimgurcom2yz90pmjpg
3                                                      noo
4             ding dong kaepers gone yes friday good start
                               ...                        
49995                                                never
49996    think tosh best one left napier gone cristobal...
49997                     close could really use tight end
49998                           yeah didnt really end well
49999                                      fucking kidding
Name: txt, Length: 50000, dtype: object


In [12]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
porter_stemmer = PorterStemmer()
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['txt'] = df['txt'].apply(stem_sentences)

In [13]:
df["txt"]

0        bare better gabbert significantli better year ...
1                             draft wr matt millen probabl
2                             donehttpsiimgurcom2yz90pmjpg
3                                                      noo
4               ding dong kaeper gone ye friday good start
                               ...                        
49995                                                never
49996    think tosh best one left napier gone cristob g...
49997                     close could realli use tight end
49998                           yeah didnt realli end well
49999                                             fuck kid
Name: txt, Length: 50000, dtype: object

In [19]:
# create the feature matrix

cv = CountVectorizer(stop_words=stop)

# create target and sample

X = cv.fit_transform(df['txt'])
Y = df['cat']

# create train test split
# create train test split
# Splitting 75 25
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1)

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), max_iter=75)
mlp.fit(X_train,y_train)



MLPClassifier(hidden_layer_sizes=(30, 30, 30), max_iter=75)

In [27]:
predictions = mlp.predict(X_test)
print('Confusion Matrix: ',confusion_matrix(y_test,predictions))  
print('Classification Report:',classification_report(y_test,predictions)) 
print('Accuracy: ',accuracy_score(y_test,predictions))

Confusion Matrix:  [[5478  887]
 [ 891 5244]]
Classification Report:                         precision    recall  f1-score   support

science_and_technology       0.86      0.86      0.86      6365
                sports       0.86      0.85      0.86      6135

              accuracy                           0.86     12500
             macro avg       0.86      0.86      0.86     12500
          weighted avg       0.86      0.86      0.86     12500

Accuracy:  0.85776
