# Predictive Analytics: Milestone 3
#### Joshua Greenert, Gabriel Avinaz, and Mithil Patel
#### DSC630-T301 Predictive Analytics
#### 12/26/2022

In [3]:
# Import the required libaries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [2]:
# Pull in the data.  Each person can have their own path.  Comment out the one that isn't yours if different.
# Josh G
# df_steam_reviews = pd.read_csv('../../../../../Downloads/steam_reviews.csv')
# Gabe A
df_steam_reviews = pd.read_csv('steam_reviews.csv', low_memory=False)

In [3]:
# Show the head of the dataframe to confirm the data is present.
df_steam_reviews.head(5)

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,0,292030,The Witcher 3: Wild Hunt,85185598,schinese,不玩此生遗憾，RPG游戏里的天花板，太吸引人了,1611381629,1611381629,True,0,...,True,False,False,76561199095369542,6,2,1909.0,1448.0,1909.0,1611343000.0
1,1,292030,The Witcher 3: Wild Hunt,85185250,schinese,拔DIAO无情打桩机--杰洛特!!!,1611381030,1611381030,True,0,...,True,False,False,76561198949504115,30,10,2764.0,2743.0,2674.0,1611386000.0
2,2,292030,The Witcher 3: Wild Hunt,85185111,schinese,巫师3NB,1611380800,1611380800,True,0,...,True,False,False,76561199090098988,5,1,1061.0,1061.0,1060.0,1611384000.0
3,3,292030,The Witcher 3: Wild Hunt,85184605,english,"One of the best RPG's of all time, worthy of a...",1611379970,1611379970,True,0,...,True,False,False,76561199054755373,5,3,5587.0,3200.0,5524.0,1611384000.0
4,4,292030,The Witcher 3: Wild Hunt,85184287,schinese,大作,1611379427,1611379427,True,0,...,True,False,False,76561199028326951,7,4,217.0,42.0,217.0,1610788000.0


### Data Preparation

In [4]:
# Using the language column, we can remove all other languages besides english.
df_steam_reviews = df_steam_reviews[df_steam_reviews['language'] == 'english']

In [5]:
# Drop the columns that we don't need.
df_steam_reviews = df_steam_reviews.drop(['Unnamed: 0', 'review_id', 'language','author.num_games_owned', 'author.last_played' ], axis = 1)

In [6]:
# Make dummies of the columns that can conform.
df_reviews_dummies = pd.get_dummies(df_steam_reviews, columns=['recommended', 'steam_purchase', 'received_for_free',
                                                              'written_during_early_access'])

In [7]:
df_reviews_dummies.head(5)

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,votes_helpful,votes_funny,weighted_vote_score,comment_count,author.steamid,...,author.playtime_last_two_weeks,author.playtime_at_review,recommended_False,recommended_True,steam_purchase_False,steam_purchase_True,received_for_free_False,received_for_free_True,written_during_early_access_False,written_during_early_access_True
3,292030,The Witcher 3: Wild Hunt,"One of the best RPG's of all time, worthy of a...",1611379970,1611379970,0,0,0.0,0,76561199054755373,...,3200.0,5524.0,0,1,0,1,1,0,1,0
5,292030,The Witcher 3: Wild Hunt,"good story, good graphics. lots to do.",1611379264,1611379264,0,0,0.0,0,76561198170193529,...,823.0,823.0,0,1,0,1,1,0,1,0
6,292030,The Witcher 3: Wild Hunt,"dis gud,",1611379091,1611379091,0,0,0.0,0,76561198119302812,...,3398.0,4192.0,0,1,0,1,1,0,1,0
18,292030,The Witcher 3: Wild Hunt,favorite game of all time cant wait for the Ne...,1611373086,1611373086,0,0,0.0,0,76561198065591528,...,177.0,23329.0,0,1,0,1,1,0,1,0
20,292030,The Witcher 3: Wild Hunt,Why wouldn't you get this,1611371978,1611371978,0,0,0.0,0,76561198996835044,...,2004.0,8557.0,0,1,0,1,1,0,1,0


In [2]:
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

def process_sentence_lemmatize(text):
    # changing to lower case, tokenizing, and lemmatizing each fo teh descriptions
    text = str(text).lower()
    temp_sent =[]
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    for i, word in enumerate(words):
        if tags[i][1] in VERB_CODES: 
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
    # joining tokenized words and removing contractions
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    return finalsent

def process_sentence_stemm(text):
# changing to lower case, tokenizing, and stemm each fo teh descriptions
    text = str(text).lower()
    temp_sent =[]
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    for word in words:
        words = porter.stem(word) 
    words = ' '.join([word for word in words])

    # joining tokenized words and removing contractions
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    return finalsent

## This takes 6 hours

In [9]:
df_reviews_dummies['prepped_review_lemm'] = df_reviews_dummies.review.apply(process_sentence_lemmatize)
df_reviews_dummies.sample(10)

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,votes_helpful,votes_funny,weighted_vote_score,comment_count,author.steamid,...,author.playtime_at_review,recommended_False,recommended_True,steam_purchase_False,steam_purchase_True,received_for_free_False,received_for_free_True,written_during_early_access_False,written_during_early_access_True,prepped_review_lemm
8218311,1145360,Hades,Reviewing now that I have the appropriate amou...,1606369651,1606369651,0,0,0.0,0,76561198048104819,...,4155.0,0,1,0,1,1,0,1,0,review appropriate amount hour play game right...
4055867,646570,Slay the Spire,This game is fun to be bad at.\nIf you're good...,1574873305,1574873305,0,0,0.0,0,76561198047293459,...,345.0,0,1,0,1,1,0,1,0,game fun bad good bet even better
9488630,205100,Dishonored,This is one of the best games i've ever played...,1417106149,1417106149,0,0,0.0,0,76561198108145398,...,4844.0,0,1,0,1,1,0,1,0,one best game ever play maybe even best really...
8914836,698780,Doki Doki Literature Club,Bitch why does it take 80 years to get to the ...,1512507971,1512507971,0,0,0.0,0,76561198163486395,...,173.0,0,1,1,0,1,0,1,0,bitch take year get scary part smh man game aw...
3723523,367520,Hollow Knight,A true gem in the world of video games. Beauti...,1523734498,1523734498,0,0,0.0,0,76561198105228881,...,3558.0,0,1,0,1,1,0,1,0,true gem world video game beautiful challenge ...
13285339,271590,Grand Theft Auto V,"Chill game, lots of fun, easy to play, slightl...",1562243522,1562243522,0,0,0.0,0,76561198127722097,...,1562.0,0,1,0,1,1,0,1,0,chill game lot fun easy play slightly graphic ...
2782072,732810,Slipstream,It takes a bit of practice to stay on the road...,1527316208,1527316208,2,0,0.505495,0,76561198139158591,...,401.0,0,1,0,1,1,0,1,0,take bit practice stay road get hang driving f...
10679606,526870,Satisfactory,factory,1596319623,1596319623,0,0,0.0,0,76561198035374176,...,3712.0,0,1,0,1,1,0,0,1,factory
7089760,945360,Among Us,- game concept has been done many times before...,1600492010,1600492010,0,0,0.47619,0,76561198180392223,...,292.0,1,0,0,1,1,0,1,0,game concept many time ttt town salem cluedo e...
8543122,1289310,Helltaker,Amazing game for people who like to slowly kil...,1595458098,1595458182,0,0,0.0,0,76561198140839080,...,117.0,0,1,1,0,1,0,1,0,amazing game people like slowly kill sanity re...


In [3]:
type(df_reviews_dummies.review)

NameError: name 'df_reviews_dummies' is not defined

In [2]:
df_reviews_dummies = pd.read_csv('Prepped_test_out.csv', low_memory=False)

# DONT RUN

In [8]:
import sys
print(sys.getrecursionlimit())

sys.setrecursionlimit(5000)

5000


In [None]:
df_reviews_dummies['prepped_review_stemm'] = df_reviews_dummies.review.apply(process_sentence_stemm)
df_reviews_dummies.sample(10)

----------------------------------------------------------------------

In [10]:
df_reviews_dummies.to_csv('Prepped_test_out.csv', index=False)

In [6]:
tfidfvec = TfidfVectorizer()

tfidf_reviews = tfidfvec.fit_transform((df_reviews_dummies['prepped_review_lemm'].values.astype('U')))

MemoryError: Unable to allocate 306. GiB for an array with shape (9635437,) and data type <U8519