# Playstyle column clean

In [7]:
#Import dependencies
import pandas as pd
import os
import numpy as np
import scipy.stats as stats
import re

In [10]:
# Load the survey data that already had its NAs cleaned.
# The file is read with the ISO-8859-1 encoding.
cleaned_df = pd.read_csv(
    "cleaned_data.csv",
    encoding="ISO-8859-1",
)

# Preview the first rows of the frame
cleaned_df.head()

Unnamed: 0,S. No.,Timestamp,GADE,Game,Platform,Hours,earnings,whyplay,League,streams,...,Degree,Birthplace,Residence,Playstyle,accept,GAD_T,SWL_T,SPIN_T,Residence_ISO3,Birthplace_ISO3
0,1,42052.00437,Not difficult at all,Skyrim,"Console (PS, Xbox, ...)",15.0,I play for fun,having fun,,0.0,...,Bachelor (or equivalent),USA,USA,Singleplayer,Accept,1,23,5.0,USA,USA
1,2,42052.0068,Somewhat difficult,Other,PC,8.0,I play for fun,having fun,,2.0,...,Bachelor (or equivalent),USA,USA,Multiplayer - online - with strangers,Accept,8,16,33.0,USA,USA
2,3,42052.0386,Not difficult at all,Other,PC,0.0,I play for fun,having fun,,0.0,...,Bachelor (or equivalent),Germany,Germany,Singleplayer,Accept,8,17,31.0,DEU,DEU
3,4,42052.06804,Not difficult at all,Other,PC,20.0,I play for fun,improving,,5.0,...,Bachelor (or equivalent),USA,USA,Multiplayer - online - with online acquaintanc...,Accept,0,17,11.0,USA,USA
4,5,42052.08948,Very difficult,Other,"Console (PS, Xbox, ...)",20.0,I play for fun,having fun,,1.0,...,High school diploma (or equivalent),USA,South Korea,Multiplayer - online - with strangers,Accept,14,14,13.0,KOR,USA


In [11]:
# Number of rows in the loaded frame
print(cleaned_df.shape[0])

11682


In [4]:
# Remove the row limit so long outputs are printed in full
# (this changes the pandas display option for the rest of the session)
pd.options.display.max_rows = None

# Frequency of every distinct answer in the Playstyle column
cleaned_df["Playstyle"].value_counts()

Multiplayer - online - with real life friends                                                                                                                4860
Multiplayer - online - with strangers                                                                                                                        3567
Multiplayer - online - with online acquaintances or teammates                                                                                                2300
Singleplayer                                                                                                                                                  663
Multiplayer - offline (people in the same room)                                                                                                                38
all of the above                                                                                                                                                7
All of the above            

Original choices from the survey:
* Singleplayer
* Multiplayer - offline (people in the same room)
* Multiplayer - online - with strangers
* Multiplayer - online - with online acquaintances or teammates
* Multiplayer - online - with real life friends

Will be adding the following options:
* All of the above (labelled "Both Singleplayer and Multiplayer" in the cleaned data)
* Other

In [5]:
# Group the free-text Playstyle answers into three canonical labels:
# "Singleplayer", "Multiplayer" and "Both Singleplayer and Multiplayer".
# Answers that match no keyword are left as they are in this cell.
#
# Only the Playstyle column is rewritten. Calling DataFrame.replace on the whole
# frame would also overwrite any other text column whose value happens to
# contain a keyword (e.g. GADE == "Not difficult at all" matches "all").

# Ordered (label, keywords) rules. A value containing a keyword (case-insensitive)
# is replaced entirely by the label. Order matters: rules are applied top to
# bottom, keyword by keyword, and a value that has already been replaced by a
# label is only changed again if that label itself contains a later keyword.
PLAYSTYLE_RULES = [
    (
        "Singleplayer",
        ["singleplayer", "single player", "alone", "solo"],
    ),
    (
        "Multiplayer",
        [
            "multiplayer", "duo", "friend", "stranger", "multi", "online",
            "with", "last 3", "last 2", "acquaintance", "options",
        ],
    ),
    (
        "Both Singleplayer and Multiplayer",
        [
            "both", "all of the above", "all of them", "all options", "of all",
            "all of the", "single/multi", "everything", "alone and",
            "single and multi", "depends", "5 choices", "all", "equal",
            "versatile", "one",
        ],
    ),
]

playstyle_raw = cleaned_df["Playstyle"]
playstyle_clean = playstyle_raw
for label, keywords in PLAYSTYLE_RULES:
    for keyword in keywords:
        # ".*keyword.*" makes the match span the whole answer, so the entire
        # value is swapped for the label rather than just the keyword.
        playstyle_clean = playstyle_clean.replace(
            re.compile(".*" + keyword + ".*", re.IGNORECASE), label, regex=True
        )

# Keep values that are already one of the final labels. Without this, re-running
# the cell would turn "Both Singleplayer and Multiplayer" into "Singleplayer",
# because the label itself contains the keyword "singleplayer".
already_clean = playstyle_raw.isin([label for label, _ in PLAYSTYLE_RULES])
cleaned_df["Playstyle"] = playstyle_clean.where(~already_clean, playstyle_raw)

cleaned_df["Playstyle"].value_counts()

Multiplayer                              10950
Singleplayer                               693
Both Singleplayer and Multiplayer           32
aram                                         1
Porn                                         1
Yolo queue                                   1
mixture of acquintances and randomers        1
M                                            1
watching                                     1
League                                       1
Name: Playstyle, dtype: int64

In [6]:
# The three labels produced by the keyword clean-up; every remaining answer
# is bucketed as "Other".
known_styles = ["Singleplayer", "Multiplayer", "Both Singleplayer and Multiplayer"]
is_known = cleaned_df["Playstyle"].isin(known_styles)
cleaned_df["Playstyle"] = np.where(is_known, cleaned_df["Playstyle"], "Other")

cleaned_df["Playstyle"].value_counts()

Multiplayer                          10950
Singleplayer                           693
Both Singleplayer and Multiplayer       32
Other                                    7
Name: Playstyle, dtype: int64