In [58]:
import pandas as pd
import gzip
import json
import re
import os
import pickle
import nltk
import re

## setting up dataframe for manipulation

In [2]:
# assign paths for csv data

reviews_path = os.path.join(os.pardir, os.pardir, 'data/reviews.csv')
games_path = os.path.join(os.pardir, os.pardir, 'data/games.csv')

In [3]:
# create dataframes from csv files

df_reviews = pd.read_csv(reviews_path)
df_games = pd.read_csv(games_path)

In [4]:
# drop unnecessary columns

df_reviews.drop(['found_funny', 'compensation', 'user_id', 'Unnamed: 0', 'products', 'page_order',\
                'date', 'early_access', 'page'], axis=1, inplace=True)

In [5]:
# create a frequency column based on product_id, sort by said column

df_reviews['freq'] = df_reviews.groupby('product_id')['product_id'].transform('count')
df_reviews.sort_values(by=['freq', 'product_id'], ascending=[False, True], inplace=True)

In [6]:
# remove null values

df_reviews.dropna(inplace=True)

In [7]:
# remove reviews by users that had under 1 hour played for the game
# remove games that have less than 500 total reviews

df_reviews = df_reviews[df_reviews['hours'] >= 1]
df_reviews = df_reviews[df_reviews['freq'] >= 500]

In [8]:
# take subsample of data for text manipulation/modeling purposes

df_sample = df_reviews.sample(axis=0, n=250000)
df_sample.sort_values(by=['freq', 'product_id'], ascending=[False, True], inplace=True)

## manipulation trials

In [35]:
# make lowercase

df_sample['text'] = df_sample['text'].str.lower()

In [34]:
# remove new line indicators

df_sample['text'] = df_sample['text'].str.replace('\n', ' ')
df_sample['text'] = df_sample['text'].str.replace('.\n', ' ')

## make small dataframe for testing

In [98]:
df_test = df_sample[:5]
df_test

Unnamed: 0,username,hours,product_id,text,freq
3812077,Pancakes,606.2,440,exellent game that everyone and their mother s...,183666
3922485,Nope,280.2,440,"it's fun, i'd recommend.",183666
3960578,Jacques,1096.8,440,this is the best free to play fps on the market.,183666
3949884,mintoochahal,704.2,440,yes i love it,183666
3948983,Treven the Meme King,128.8,440,it's free. 10/10,183666


In [74]:
# intended to remove punctuation
# issue is that it doesn't save

for index, row in df_test.iterrows():
    row['text'] = re.sub(r'[^\w\s]', '', row['text'])

Unnamed: 0,username,hours,product_id,text,freq,tokens,stopped
3812077,Pancakes,606.2,440,exellent game that everyone and their mother s...,183666,"[exellent, game, that, everyone, and, their, m...","[exellent, game, that, everyone, and, their, m..."
3922485,Nope,280.2,440,"it's fun, i'd recommend.",183666,"[it, 's, fun, ,, i, 'd, recommend, .]","[it, 's, fun, ,, i, 'd, recommend, .]"
3960578,Jacques,1096.8,440,this is the best free to play fps on the market.,183666,"[this, is, the, best, free, to, play, fps, on,...","[this, is, the, best, free, to, play, fps, on,..."
3949884,mintoochahal,704.2,440,yes i love it,183666,"[yes, i, love, it]","[yes, i, love, it]"
3948983,Treven the Meme King,128.8,440,it's free. 10/10,183666,"[it, 's, free, ., 10/10]","[it, 's, free, ., 10/10]"


In [53]:
from nltk.corpus import stopwords

nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package words to
[nltk_data]     /Users/kyledecember1/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kyledecember1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [99]:
stop_words = stopwords.words('english')

In [104]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [107]:
df_test['tokens'] = df_test['text'].apply(nltk.word_tokenize)
df_test['tokens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


3812077    [exellent, game, that, everyone, and, their, m...
3922485                [it, 's, fun, ,, i, 'd, recommend, .]
3960578    [this, is, the, best, free, to, play, fps, on,...
3949884                                   [yes, i, love, it]
3948983                             [it, 's, free, ., 10/10]
Name: tokens, dtype: object

In [105]:
df_test['stopped'] = [word for word in df_test['tokens'] if word not in stop_words]  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [106]:
df_test['stopped']

3812077    [exellent, game, that, everyone, and, their, m...
3922485                [it, 's, fun, ,, i, 'd, recommend, .]
3960578    [this, is, the, best, free, to, play, fps, on,...
3949884                                   [yes, i, love, it]
3948983                             [it, 's, free, ., 10/10]
Name: stopped, dtype: object

In [108]:
# join stopped lists into one string

df_test['stopped'] = df_test['stopped'].apply(', '.join)
df_test['stopped']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


3812077    exellent, game, that, everyone, and, their, mo...
3922485                  it, 's, fun, ,, i, 'd, recommend, .
3960578    this, is, the, best, free, to, play, fps, on, ...
3949884                                     yes, i, love, it
3948983                               it, 's, free, ., 10/10
Name: stopped, dtype: object

# further requirements
    - remove punctuation
    - determine why stopped words are not being removed

### attempts to remove punctuation

In [None]:
# attempt 1

df_test['stopped'].apply(re.sub(r'[^\w\s]', '', df_test['stopped']))

In [None]:
# attempt 2

for index, row in df_test.iterrows():
    row['text'] = re.sub(r'[^\w\s]', '', row['text'])

In [None]:
# attempt 3

re.sub(r'[^\w\s]', '', df_test['text'].iloc[0])