In [1]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', 100)

import re
import string
import nltk

from textblob import TextBlob, Word
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../../../data/reviews.csv")

col = ["review_id", "year", "title"]
df = df.drop(col, axis = 1)

df.head(10)

Unnamed: 0,user_review,user_suggestion
0,I'm scared and hearing creepy voices. So I'll pause for a moment and write a review while I wai...,1
1,"Best game, more better than Sam Pepper's YouTube account. 10/10What you'll need to play:A comput...",1
2,"A littly iffy on the controls, but once you know how to play, very easy to master. I've made it ...",1
3,"Great game, fun and colorful and all that.A side note, though: When are we getting windowed mode...",1
4,Not many games have the cute tag right next to the horror tag on Steam.I first played this game ...,1
5,"Early Access ReviewIt's pretty cute at first, but then later gets horrifying and it really does ...",1
6,Great game. it's a cute little horror game that progressively gets darker and scarier. It has a ...,1
7,Spooky's Jump Scare Mansion is a Free Retro maze game with jump scares and death. It worked on ...,1
8,"Somewhere between light hearted, happy parody and being afraid of the dark lies Spooky's House o...",0
9,This game with its cute little out of the wall pop-ups that scared the living light out of me I ...,1


In [3]:
stopwords = nltk.corpus.stopwords.words('english')
punct = string.punctuation

lemma = nltk.WordNetLemmatizer()

In [4]:
def clean_reviews(reviews):
    reviews = "".join([word for word in reviews if word not in string.punctuation])
    tokens = re.split('\W+', reviews)
    reviews = [lemma.lemmatize(word) for word in tokens if word not in stopwords]
    return reviews

In [5]:
df['clean_review'] = df['user_review'].apply(lambda x: clean_reviews(x.lower()))

df.head(10)

Unnamed: 0,user_review,user_suggestion,clean_review
0,I'm scared and hearing creepy voices. So I'll pause for a moment and write a review while I wai...,1,"[im, scared, hearing, creepy, voice, ill, pause, moment, write, review, wait, heart, beat, retur..."
1,"Best game, more better than Sam Pepper's YouTube account. 10/10What you'll need to play:A comput...",1,"[best, game, better, sam, pepper, youtube, account, 1010what, youll, need, playa, computersome, ..."
2,"A littly iffy on the controls, but once you know how to play, very easy to master. I've made it ...",1,"[littly, iffy, control, know, play, easy, master, ive, made, floor, 1000, due, certain, circumst..."
3,"Great game, fun and colorful and all that.A side note, though: When are we getting windowed mode...",1,"[great, game, fun, colorful, thata, side, note, though, getting, windowed, mode, computer, hate,..."
4,Not many games have the cute tag right next to the horror tag on Steam.I first played this game ...,1,"[many, game, cute, tag, right, next, horror, tag, steami, first, played, game, late, 2014, comin..."
5,"Early Access ReviewIt's pretty cute at first, but then later gets horrifying and it really does ...",1,"[early, access, reviewits, pretty, cute, first, later, get, horrifying, really, jumpscare, speci..."
6,Great game. it's a cute little horror game that progressively gets darker and scarier. It has a ...,1,"[great, game, cute, little, horror, game, progressively, get, darker, scarier, sense, humor, ive..."
7,Spooky's Jump Scare Mansion is a Free Retro maze game with jump scares and death. It worked on ...,1,"[spookys, jump, scare, mansion, free, retro, maze, game, jump, scare, death, worked, win, 10, lo..."
8,"Somewhere between light hearted, happy parody and being afraid of the dark lies Spooky's House o...",0,"[somewhere, light, hearted, happy, parody, afraid, dark, lie, spookys, house, jump, scare, taske..."
9,This game with its cute little out of the wall pop-ups that scared the living light out of me I ...,1,"[game, cute, little, wall, popups, scared, living, light, like, much, havent, played, lately, go..."


In [6]:
col = ["user_suggestion", "user_review"]
X = df.drop(col, axis=1)
y = df[["user_suggestion"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=57, test_size = 0.5)

In [7]:
new_df = pd.concat([X_train, y_train], axis=1)
new_df.head(10)

Unnamed: 0,clean_review,user_suggestion
1190,"[early, access, reviewalthough, game, still, alpha, fractured, space, lot, wonderful, thing, off...",1
2820,"[dont, bother, downloading, unless, ton, first, game, look, pretty, tier, 1, tier, 2, fun, resea...",0
8090,"[every, 23, minute, ingame, reminded, much, erection, give, devs, sink, little, money, game, hon...",0
12981,"[honestly, really, frustrating, feel, like, great, first, couple, hour, people, get, sad, get, l...",0
652,"[early, access, reviewi, dont, know, game, still, like, experience, purely, awful, average, fps,...",0
14915,"[early, access, reviewthis, game, possibly, one, worst, game, steam, huge, potential, good, pers...",0
379,"[many, clicker, game, steam, lol, started, review, question, goddamnit, well, choice, want, brin...",1
6358,"[early, access, reviewhard, reccomend, see, potenial, beta, year, feel, like, year, go, near, fi...",0
4729,"[playing, game, 200, hour, steam, ive, come, conclusion, enjoyable, moba, ive, ever, played, lit...",1
467,"[dont, know, clicker, hero, first, game, kind, certainly, first, consider, compare, game, itif, ...",0


In [8]:
new_df.to_csv("../../../data/first_half.csv", index=False)

In [9]:
other_df = pd.concat([X_test, y_test], axis=1)
other_df.head(10)

Unnamed: 0,clean_review,user_suggestion
6040,"[best, modern, combat, flight, sim, game, market, great, graphic, adjustable, difficulty, fairly...",1
12730,"[barebones, game, free2play, control, sticky, best, meaning, u, steer, one, way, ur, car, forced...",0
13933,"[worst, game, ing, planet, spend, minute, entering, server, get, kicked, reason, atter, second, ...",0
298,"[early, access, reviewhows, game, even, free, cool, game, expensive, preorder, exclusive, dlcs, ...",1
12153,"[early, access, reviewthird, attempt, apparently, wasnt, charm, gamethis, third, official, launc...",0
2261,"[even, need, write, review, fantastic, game, guess, everyone, else, written, one, im, level, 22,...",1
10804,"[early, access, reviewi, played, game, enough, simply, boil, 2, thing, either, see, passed, mana...",0
4764,"[game, really, fun, play, recommend, game, anyone, like, play, game, like, may, unbalanced, item...",1
3171,"[, 500, aircraft, ranging, he51, biplane, carrierborne, f4u, corsair, hawker, hunter, subsonic, ...",1
11294,"[early, access, reviewi, kinda, feeling, game, developer, salty, magic, player, game, bad, rulin...",0


In [10]:
other_df.to_csv("../../../data/second_half.csv", index=False)