In [1]:
import numpy as np
import pandas as pd
import data_utils
import re
import spacy
from tqdm import tqdm
import string
import ast

In [1]:
"""Notebook to clean data:
* Train
* Overview
* Test
It does not remove stop words. It removes non-ASCII characters."""

'Notebook to clean data:\n* Train\n* Overview\n* Test\nIt does not remove stop words. It removes non-ASCII characters.'

In [300]:
# Counter is used further down for the vocabulary statistics.
from collections import Counter
import matplotlib.pyplot as plt
# Load spaCy's small English pipeline; `nlp` is used later to tokenise the corpus.
nlp = spacy.load("en_core_web_sm")
# Size of spaCy's default English stop-word set (displayed as the cell output).
len(nlp.Defaults.stop_words)

326

In [26]:
# Handle on spaCy's default stop-word set; no visible cell in this notebook applies it.
stop_words = nlp.Defaults.stop_words

In [2]:
# Raw inputs: the labelled reviews and the per-game overview table.
train = pd.read_csv("train_data/train.csv")
overview = pd.read_csv("train_data/game_overview.csv")
# Single corpus for cleaning: every review first, followed by every overview,
# so the two sources can be split apart again by position afterwards.
corpus = [*train['user_review'].values, *overview['overview'].values]

In [198]:
# Report the input sizes and check by eye that the corpus holds every review plus every overview.
print("Train Data Shape ",train.shape)
print("Overview Shape ", overview.shape)
# The label is derived from the frames instead of being typed in by hand, so it
# cannot go stale when the input files change.
print(len(train), "+", len(overview), "= ", len(corpus))

Train Data Shape  (17494, 5)
Overview Shape  (64, 5)
17494 + 64 =  17558


In [105]:
# Number of documents (reviews + overviews) that will be cleaned.
print("CORPUS LENGTH : ", len(corpus))

CORPUS LENGTH :  17558


In [169]:
# Normalise every document of the corpus in a single pass.
# Every character of string.punctuation (backslash included) is mapped to a
# space; a translation table avoids the invalid "\]" escape that a non-raw
# regex literal would need.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in string.punctuation})
_WHITESPACE_RUN = re.compile(r"\s+")


def _normalise_text(raw_text):
    """Expand contractions, strip accents and punctuation, collapse whitespace, lowercase."""
    text = data_utils.expand_contractions(raw_text)
    text = data_utils.remove_accented_chars(text)
    text = text.translate(_PUNCT_TO_SPACE)
    # Runs of whitespace collapse to one space; a single leading/trailing
    # space is deliberately left in place (no strip).
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.lower()


text_sent = [_normalise_text(raw_text) for raw_text in corpus]
text_sent[0]

'i am scared and hearing creepy voices so i will pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times this game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood but more bubble and clean hello 1990s what charactes there are that isnot trying to kill me were likable and a bit odd i did do a few noob things though such as oh look a class room full of ghosts from dead children lets shine my flashlight on them and stand there staring at them or hmm creepy music i will turn around and see if i can see what is chasing me never before in a game have i been this afraid of finding a locked door '

In [174]:
# Sanity check: after normalisation no token should still contain punctuation.
# The vocabulary is rebuilt from `text_sent` right here so this cell does not
# rely on a counter that is only created further down the notebook (which
# raises NameError on a fresh Restart & Run All).
punctuations = set(string.punctuation)
vocabulary_by_frequency = [word for word, _ in Counter(" ".join(text_sent).split()).most_common()]
words_with_punctuation = [word for word in vocabulary_by_frequency if punctuations.intersection(word)]
words_with_punctuation  # expected to be empty

[]

In [170]:
#Creating word count for further operations
words = [t for text in text_sent for t in text.split()]
words_counter = Counter(words)

In [171]:
# Frequency of frequencies: how many distinct words occur exactly k times.
words_count = [count for _, count in reversed(words_counter.most_common())]
words_count_counter = Counter(words_count)
# Each pair shown is (k, number of distinct words that occur exactly k times);
# the first entry therefore counts the words that appear only once in the corpus.
words_count_counter.most_common(10)

[(1, 30813),
 (2, 6362),
 (3, 3021),
 (4, 1830),
 (5, 1243),
 (6, 897),
 (7, 740),
 (8, 566),
 (9, 521),
 (10, 462)]

In [186]:
# Stream the cleaned texts through the spaCy pipeline. nlp.pipe yields Doc
# objects lazily, so `docs` can only be iterated once.
docs = nlp.pipe(text_sent)

In [187]:
# Tokenise every normalised document with spaCy and keep all tokens except
# those that look like a URL or an e-mail address.
# The spaCy stream is created inside this cell: nlp.pipe is lazy and single-pass,
# so building it here makes the cell safe to re-run. Wrapping it in tqdm gives
# a per-document progress bar that actually reaches 100%.
# NOTE(review): punctuation such as '.', '/', ':' and '@' has already been
# replaced by spaces before tokenisation, so like_url / like_email probably
# never match at this point - confirm whether the filter should run earlier.
cleaned_text = []
for doc in tqdm(nlp.pipe(text_sent), total=len(text_sent)):
    cleaned_text.append(
        [token.text for token in doc if not (token.like_url or token.like_email)]
    )

100%|█████████▉| 17500/17558 [05:07<00:01, 56.94it/s]


In [201]:
# Peek at the raw training reviews before any cleaned text is written back.
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [211]:
# Work on copies so the raw `train` / `overview` frames stay untouched.
cleaned_train, cleaned_overview = train.copy(), overview.copy()

In [212]:
# Re-join the tokens of each document once, then split the corpus back into
# its two sources: reviews come first, overviews follow.
joined_text = [" ".join(tokens) for tokens in cleaned_text]
# Split at the actual number of reviews instead of a hard-coded overview count.
n_reviews = len(cleaned_train)
assert len(joined_text) == n_reviews + len(cleaned_overview), (
    "cleaned_text does not line up with train + overview rows"
)
cleaned_train['user_review'] = joined_text[:n_reviews]
cleaned_overview['overview'] = joined_text[n_reviews:]

In [280]:
# Spot-check that `user_review` now holds the cleaned, lower-cased text.
cleaned_train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,i am scared and hearing creepy voices so i wil...,1
1,2,Spooky's Jump Scare Mansion,2016.0,best game more better than sam peppers youtube...,1
2,3,Spooky's Jump Scare Mansion,2016.0,a littly iffy on the controls but once you kno...,1
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun and colorful and all that a sid...,1
4,5,Spooky's Jump Scare Mansion,2015.0,not many games have the cute tag right next to...,1


In [279]:
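# Let's clean the tags as well.
# NOTE(review): the cleaned_overview.head() output below shows space-joined tags,
# so this code appears to have been executed once before being commented out.
# On a fresh Restart & Run All the tags column would stay in its raw form - confirm
# whether this step should be restored (reading from overview['tags'] would make it re-runnable).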
# tags = cleaned_overview['tags'].values
# tags = [ast.literal_eval(tag) for tag in tags]
# tags = [" ".join(tag) for tag in tags]
# cleaned_overview['tags'] = tags

In [248]:
# Spot-check the overview table after its `overview` column was replaced with cleaned text.
cleaned_overview.head()

Unnamed: 0,title,developer,publisher,tags,overview
0,Spooky's Jump Scare Mansion,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
1,Sakura Clicker,Winged Cloud,Winged Cloud,Nudity Anime Free to Play Mature Sexual Conten...,the latest entry in the sakura series is more ...
2,WARMODE,WARTEAM,WARTEAM,Early Access Free to Play FPS Multiplayer Shoo...,free to play shooter about the confrontation o...
3,Fractured Space,Edge Case Games Ltd.,Edge Case Games Ltd.,Space Multiplayer Free to Play PvP MOBA Action...,take the helm of a gigantic capital ship and g...
4,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,FPS Multiplayer Shooter Action Team-Based Comp...,counter strike global offensive cs go expands ...


In [257]:
# Attach each game's overview metadata to its reviews and persist the result.
# merge() defaults to an inner join, so a review whose title has no row in the
# overview table would not appear in the output.
games_data = cleaned_train.merge(cleaned_overview, left_on='title', right_on='title')
games_data.to_csv(
    "proper_cleaned_data/clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv",
    index=False,
)

In [281]:
# Spot-check the merged review + overview frame that was written to disk.
games_data.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
0,1,Spooky's Jump Scare Mansion,2016.0,i am scared and hearing creepy voices so i wil...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
1,2,Spooky's Jump Scare Mansion,2016.0,best game more better than sam peppers youtube...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
2,3,Spooky's Jump Scare Mansion,2016.0,a littly iffy on the controls but once you kno...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun and colorful and all that a sid...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
4,5,Spooky's Jump Scare Mansion,2015.0,not many games have the cute tag right next to...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...


In [293]:
# Reviews that ended up completely empty after cleaning.
games_data.loc[games_data['user_review'].eq('')]

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
215,216,Spooky's Jump Scare Mansion,2017.0,,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
2777,3241,War Thunder,2018.0,,1,Gaijin Entertainment,Gaijin Entertainment,Free to Play World War II Multiplayer Simulati...,war thunder is the most comprehensive free to ...
9515,13099,Realm of the Mad God,2017.0,,0,"Wild Shadow Studios, Deca Games",Deca Games,Free to Play Massively Multiplayer Pixel Graph...,realm of the mad god is the first ever free to...
12868,18675,Fallout Shelter,2017.0,,1,Bethesda Game Studios,Bethesda Softworks,Free to Play Survival Base Building Post-apoca...,fallout shelter puts you in control of a state...
14342,20669,Shop Heroes,2016.0,,1,"Cloudcade, Inc.","Cloudcade, Inc.",Free to Play Simulation Casual Strategy RPG Ma...,craft legendary items gear up your heroes for ...
16690,24659,Bloons TD Battles,2017.0,,1,Ninja Kiwi,Ninja Kiwi,Free to Play Tower Defense Multiplayer Strateg...,play the top rated tower defense franchise in ...


In [294]:
# Reviews that were reduced to a single space by the cleaning steps.
games_data.loc[games_data['user_review'].eq(' ')]

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
190,191,Spooky's Jump Scare Mansion,2017.0,,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
438,439,Sakura Clicker,2015.0,,1,Winged Cloud,Winged Cloud,Nudity Anime Free to Play Mature Sexual Conten...,the latest entry in the sakura series is more ...
1672,2136,Dota 2,2014.0,,1,Valve,Valve,Free to Play MOBA Strategy Multiplayer Team-Ba...,the most played game on steam every day millio...
1729,2193,Dota 2,2014.0,,1,Valve,Valve,Free to Play MOBA Strategy Multiplayer Team-Ba...,the most played game on steam every day millio...
1899,2363,Dota 2,2016.0,,1,Valve,Valve,Free to Play MOBA Strategy Multiplayer Team-Ba...,the most played game on steam every day millio...
3662,4126,Team Fortress 2,2015.0,,1,Valve,Valve,Free to Play Multiplayer FPS Action Shooter Cl...,the most fun you can have online pc gameris ...
3869,4333,Team Fortress 2,2016.0,,1,Valve,Valve,Free to Play Multiplayer FPS Action Shooter Cl...,the most fun you can have online pc gameris ...
5587,7512,World of Tanks Blitz,2017.0,,1,Wargaming Group Limited,Wargaming Group Limited,Tanks Free to Play Multiplayer Action World Wa...,world of tanks blitz is a cross platform free ...
5860,8235,DCS World Steam Edition,2016.0,,1,Eagle Dynamics SA,The Fighter Collection,Simulation Flight Free to Play Military Multip...,feel the excitement of flying the su 25 t frog...
6414,8789,Heroes & Generals,2017.0,,0,RETO MOTO,RETO MOTO,Free to Play World War II Multiplayer FPS War ...,heroes generals is a full on all out war exper...


## Test Cleaning

In [304]:
def test_cleaning(
    df,
    overview_path="train_data/game_overview.csv",
    output_path="proper_cleaned_data/test_clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv",
):
    """Clean the reviews in ``df`` and the game overviews, merge them and write a CSV.

    The reviews and the overviews are cleaned as one corpus with the same steps:
    contraction expansion, accent removal, punctuation -> space, whitespace
    collapsing, lower-casing, then spaCy tokenisation with URL-/e-mail-like
    tokens dropped. Stop words are not removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a ``user_review`` column (text to clean) and a
        ``title`` column (merge key). It is copied, not modified.
    overview_path : str, optional
        CSV with at least ``title`` and ``overview`` columns.
    output_path : str, optional
        Destination of the merged, cleaned CSV (written without the index).

    Returns
    -------
    None
        The result is only written to ``output_path``; shapes are printed.

    Notes
    -----
    Relies on the notebook-level ``nlp`` spaCy pipeline and on ``data_utils``.
    """
    overview = pd.read_csv(overview_path)
    # Work on a copy so the caller's frame is not silently overwritten.
    reviews = df.copy()
    n_reviews = len(reviews)

    # Reviews first, overviews second: the corpus is split again by position below.
    corpus = list(reviews['user_review'].values) + list(overview['overview'].values)
    print("File Data Shape ", reviews.shape)
    print("Overview Shape ", overview.shape)
    print("CORPUS LENGTH : ", str(len(corpus)))

    # Every character of string.punctuation (backslash included) becomes a space.
    punct_to_space = str.maketrans({ch: " " for ch in string.punctuation})
    whitespace_run = re.compile(r"\s+")
    text_sent = []
    for raw_text in corpus:
        text = data_utils.expand_contractions(raw_text)
        text = data_utils.remove_accented_chars(text)
        text = text.translate(punct_to_space)
        # Collapses whitespace runs; a single leading/trailing space is kept.
        text = whitespace_run.sub(" ", text)
        text_sent.append(text.lower())

    # NOTE(review): punctuation such as '.', '/', ':' and '@' is already gone
    # at this point, so like_url / like_email probably never match - confirm
    # whether the filter should run before punctuation stripping.
    cleaned_text = []
    for doc in tqdm(nlp.pipe(text_sent), total=len(text_sent)):
        kept = [token.text for token in doc if not (token.like_url or token.like_email)]
        cleaned_text.append(" ".join(kept))

    # Split at the real number of reviews rather than a hard-coded overview count.
    reviews['user_review'] = cleaned_text[:n_reviews]
    overview['overview'] = cleaned_text[n_reviews:]
    df_data = pd.merge(reviews, overview, left_on='title', right_on='title')
    df_data.to_csv(output_path, index=False)

In [306]:
# Load the unlabelled test reviews and run them through the cleaning function,
# which writes the merged, cleaned CSV to disk.
df = pd.read_csv("test_data/test.csv")
test_cleaning(df)

File Data Shape  (8045, 4)
Overview Shape  (64, 5)
CORPUS LENGTH :  8109


 99%|█████████▊| 8000/8109 [01:49<00:01, 73.13it/s]


In [2]:
# Reload the cleaned, merged training CSV written earlier in this notebook.
# NOTE(review): this rebinds `train`, replacing the raw frame read from
# train_data/train.csv - re-running earlier cells afterwards would see the cleaned data.
train = pd.read_csv("proper_cleaned_data/clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv")