# Predictive Analytics: Milestone 3
#### Joshua Greenert, Gabriel Avinaz, and Mithil Patel
#### DSC630-T301 Predictive Analytics
#### 12/26/2022

In [1]:
# Import the required libaries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [2]:
# Pull in the data.  Each person can have their own path.  Comment out the one that isn't yours if different.
# Josh G
df_steam_reviews = pd.read_csv('../../../../../Downloads/steam_reviews.csv', low_memory=False)
# Gabe A
# df_steam_reviews = pd.read_csv('steam_reviews.csv', low_memory=False)

In [3]:
# Show the head of the dataframe to confirm the data is present.
df_steam_reviews.head(5)

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,0,292030,The Witcher 3: Wild Hunt,85185598,schinese,不玩此生遗憾，RPG游戏里的天花板，太吸引人了,1611381629,1611381629,True,0,...,True,False,False,76561199095369542,6,2,1909.0,1448.0,1909.0,1611343000.0
1,1,292030,The Witcher 3: Wild Hunt,85185250,schinese,拔DIAO无情打桩机--杰洛特!!!,1611381030,1611381030,True,0,...,True,False,False,76561198949504115,30,10,2764.0,2743.0,2674.0,1611386000.0
2,2,292030,The Witcher 3: Wild Hunt,85185111,schinese,巫师3NB,1611380800,1611380800,True,0,...,True,False,False,76561199090098988,5,1,1061.0,1061.0,1060.0,1611384000.0
3,3,292030,The Witcher 3: Wild Hunt,85184605,english,"One of the best RPG's of all time, worthy of a...",1611379970,1611379970,True,0,...,True,False,False,76561199054755373,5,3,5587.0,3200.0,5524.0,1611384000.0
4,4,292030,The Witcher 3: Wild Hunt,85184287,schinese,大作,1611379427,1611379427,True,0,...,True,False,False,76561199028326951,7,4,217.0,42.0,217.0,1610788000.0


### Data Preparation

In [4]:
# Using the language column, we can remove all other languages besides english.
df_steam_reviews = df_steam_reviews[df_steam_reviews['language'] == 'english']

In [5]:
# Drop the columns that we don't need.
df_steam_reviews = df_steam_reviews.drop(['Unnamed: 0', 'review_id', 'language','author.num_games_owned', 'author.last_played' ], axis = 1)

In [6]:
# Make dummies of the columns that can conform.
df_reviews_dummies = pd.get_dummies(df_steam_reviews, columns=['recommended', 'steam_purchase', 'received_for_free',
                                                              'written_during_early_access'])

In [7]:
df_reviews_dummies.head(5)

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,votes_helpful,votes_funny,weighted_vote_score,comment_count,author.steamid,...,author.playtime_last_two_weeks,author.playtime_at_review,recommended_False,recommended_True,steam_purchase_False,steam_purchase_True,received_for_free_False,received_for_free_True,written_during_early_access_False,written_during_early_access_True
3,292030,The Witcher 3: Wild Hunt,"One of the best RPG's of all time, worthy of a...",1611379970,1611379970,0,0,0.0,0,76561199054755373,...,3200.0,5524.0,0,1,0,1,1,0,1,0
5,292030,The Witcher 3: Wild Hunt,"good story, good graphics. lots to do.",1611379264,1611379264,0,0,0.0,0,76561198170193529,...,823.0,823.0,0,1,0,1,1,0,1,0
6,292030,The Witcher 3: Wild Hunt,"dis gud,",1611379091,1611379091,0,0,0.0,0,76561198119302812,...,3398.0,4192.0,0,1,0,1,1,0,1,0
18,292030,The Witcher 3: Wild Hunt,favorite game of all time cant wait for the Ne...,1611373086,1611373086,0,0,0.0,0,76561198065591528,...,177.0,23329.0,0,1,0,1,1,0,1,0
20,292030,The Witcher 3: Wild Hunt,Why wouldn't you get this,1611371978,1611371978,0,0,0.0,0,76561198996835044,...,2004.0,8557.0,0,1,0,1,1,0,1,0


In [8]:
# Create the lemmatizer, porter, and codes.
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

# Define a function that processes the lemmatizing on each sentence.
def process_sentence_lemmatize(text):
    # changing to lower case, tokenizing, and lemmatizing each of the descriptions
    text = str(text).lower()
    temp_sent =[]
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    
    # For loop to remove verb portion of words
    for i, word in enumerate(words):
        if tags[i][1] in VERB_CODES: 
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
            
    # joining tokenized words and removing contractions
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    
    return finalsent

# Define a function that performs additional preparation by replacing contractions with full words.
def process_sentence_stemm(text):
    # changing to lower case, tokenizing, and stemm each of the descriptions
    text = str(text).lower()
    temp_sent =[]
    words = word_tokenize(text)
    tags = nltk.pos_tag(words)
    for word in words:
        words = porter.stem(word) 
    words = ' '.join([word for word in words])

    # joining tokenized words and removing contractions
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    return finalsent

In [10]:
# Download the required libraries.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...


True

## WARNING: This lemmatization takes up to 6 hours to complete.

In [None]:
# Calling the process_sentence_lemmatize function through the apply method.
df_reviews_dummies['prepped_review_lemm'] = df_reviews_dummies.review.apply(process_sentence_lemmatize)
df_reviews_dummies.sample(10)

## Checkpoint File:  File present to ensure you don't have to rerun the 6 hour code above.

In [None]:
df_reviews_dummies.to_csv('Prepped_test_out.csv', index=False)

## New Start Point: 

In [None]:
# Pull in the dataframe.
# Josh G
df_reviews_dummies = pd.read_csv('../../../../../Downloads/Prepped_test_out.csv', low_memory=False)

# DONT RUN

In [8]:
import sys
print(sys.getrecursionlimit())

sys.setrecursionlimit(5000)

5000


In [None]:
df_reviews_dummies['prepped_review_stemm'] = df_reviews_dummies.review.apply(process_sentence_stemm)
df_reviews_dummies.sample(10)

----------------------------------------------------------------------

In [10]:
df_reviews_dummies.to_csv('Prepped_test_out.csv', index=False)

In [6]:
tfidfvec = TfidfVectorizer()

tfidf_reviews = tfidfvec.fit_transform((df_reviews_dummies['prepped_review_lemm'].values.astype('U')))

MemoryError: Unable to allocate 306. GiB for an array with shape (9635437,) and data type <U8519