In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Combining Comments and Posts

In [2]:
df = pd.read_csv('../data/cleanposts.csv')
df2 = pd.read_csv('../data/cleancomments.csv')

In [3]:
bigdf = pd.concat([df, df2], axis=0)

bigdf['type'].value_counts()

Comments    4000
Posts       2000
Name: type, dtype: int64

In [4]:
bigdf.reset_index(drop=True)
bigdf.shape

(6000, 7)

In [5]:
bigdf.to_csv('../data/fullcleandata.csv', index=False)

In [6]:
#convert subreddit to targets, and true/false to binary

bigdf['stardew_keywords'] = bigdf['stardew_keywords'].map({True: 1, False : 0})
bigdf['elden_keywords'] = bigdf['elden_keywords'].map({True:1, False:0})
bigdf['subreddit']=bigdf['subreddit'].map({'StardewValley':1, 'Eldenring':0})

bigdf.head(3)

Unnamed: 0,subreddit,body,lemmatized,stemmed,stardew_keywords,elden_keywords,type
0,0,It is almost time my fellow hollows,It is almost time my fellow hollow,it is almost time my fellow hollow,0,0,Posts
1,0,24 Hours yet remain,24 Hours yet remain,24 hour yet remain,0,0,Posts
2,0,Your worst nightmares,Your worst nightmare,your worst nightmar,0,0,Posts


In [7]:
bigdf.dropna(inplace=True)
bigdf.isna().sum()

subreddit           0
body                0
lemmatized          0
stemmed             0
stardew_keywords    0
elden_keywords      0
type                0
dtype: int64

In [8]:
bigdf.to_csv('../data/modelreadydata.csv', index=False)

### Prep Overlapping Words
- List to add more custom stopwords

In [9]:
df3 = pd.read_csv('../data/overlappingwords.csv')
df4 = pd.read_csv('../data/overlappingwords2.csv')

In [10]:
stopdf = pd.concat([df3, df4], axis=0)

stopdf.head(3)

Unnamed: 0,word,count
0,day,108
1,farm,100
2,got,83


In [11]:
stopdf.shape

(642, 2)

In [12]:
stopdf.drop_duplicates(subset=['word'], inplace=True)

stopdf.shape

(444, 2)

In [13]:
stoplist = list(stopdf['word'])

stoplist

['day',
 'farm',
 'got',
 'love',
 'lol',
 'way',
 'use',
 'thank',
 'thanks',
 'right',
 'lot',
 'looks',
 'people',
 'try',
 'days',
 'year',
 'yeah',
 'havent',
 'feel',
 'things',
 'stuff',
 'doesnt',
 'look',
 'going',
 'little',
 'work',
 'level',
 'id',
 'probably',
 'great',
 'getting',
 'better',
 'yes',
 'best',
 'makes',
 'maybe',
 'buy',
 'ill',
 'pretty',
 'fun',
 'oh',
 'run',
 'help',
 'luck',
 'start',
 'haha',
 'items',
 'theyre',
 'say',
 'doing',
 'save',
 'having',
 'long',
 '100',
 'used',
 'bad',
 'hes',
 'super',
 'check',
 'area',
 'definitely',
 'wait',
 'away',
 'cool',
 'quality',
 'years',
 'nice',
 'started',
 'usually',
 'isnt',
 'youll',
 'bit',
 'chance',
 'post',
 'far',
 'coop',
 'hours',
 'able',
 'switch',
 'event',
 'high',
 'wanted',
 'removed',
 'trying',
 'favorite',
 'hard',
 'literally',
 'possible',
 'tried',
 'wont',
 'place',
 'change',
 'using',
 'worth',
 'ok',
 'youve',
 'drop',
 'seen',
 'single',
 'version',
 'tell',
 'amazing',
 'playe

In [14]:
custom_stopwords = ['elden',
                    'ring',
                    'stardew',
                    'valley',
                    'just',
                    'like',
                    'think',
                    'game',
                    'games',
                   'im',
                   'dont',
                   'ive',
                   'thats',
                   'youre',
                    'play',
                    'playing',
                    'time',
                    'think',
                    'thought',
                   'make',
                   'new',
                   'did',
                   'does',
                   'really',
                   'actually',
                   'sure',
                   'thing',
                   'good',
                   'know',
                   'need',
                   'want',
                   'theres',
                   'didnt',
                    'got',
                    'maybe',
                    'probably',
                    'say',
                    'said',
                    'doesnt',
                    'way',
                    'things',
                   'stuff']

In [15]:
for x in custom_stopwords:
    stoplist.append(x)

In [16]:
stoplist

['day',
 'farm',
 'got',
 'love',
 'lol',
 'way',
 'use',
 'thank',
 'thanks',
 'right',
 'lot',
 'looks',
 'people',
 'try',
 'days',
 'year',
 'yeah',
 'havent',
 'feel',
 'things',
 'stuff',
 'doesnt',
 'look',
 'going',
 'little',
 'work',
 'level',
 'id',
 'probably',
 'great',
 'getting',
 'better',
 'yes',
 'best',
 'makes',
 'maybe',
 'buy',
 'ill',
 'pretty',
 'fun',
 'oh',
 'run',
 'help',
 'luck',
 'start',
 'haha',
 'items',
 'theyre',
 'say',
 'doing',
 'save',
 'having',
 'long',
 '100',
 'used',
 'bad',
 'hes',
 'super',
 'check',
 'area',
 'definitely',
 'wait',
 'away',
 'cool',
 'quality',
 'years',
 'nice',
 'started',
 'usually',
 'isnt',
 'youll',
 'bit',
 'chance',
 'post',
 'far',
 'coop',
 'hours',
 'able',
 'switch',
 'event',
 'high',
 'wanted',
 'removed',
 'trying',
 'favorite',
 'hard',
 'literally',
 'possible',
 'tried',
 'wont',
 'place',
 'change',
 'using',
 'worth',
 'ok',
 'youve',
 'drop',
 'seen',
 'single',
 'version',
 'tell',
 'amazing',
 'playe

In [17]:
stoplistdf = pd.DataFrame(stoplist, columns = ['words'])
stoplistdf.shape

(486, 1)

In [18]:
stoplistdf.to_csv('../data/stopwords.csv', index=False)