# <b>Character's data: Importing of libraries, moduels and data set conversion</b>

In [1]:
#installing pandas library - python-3 is already installed on this machine
!pip install pandas



In [2]:
#importing of necessary libraries
import pandas as pd # converting our imported .csv file into a dataframe
import numpy as np # for performing mathematical operations on 
import os #used in this case to locate the .csv file path

In [3]:
#os called to locate file path
file_path = os.path.join(os.getcwd(), "characters.csv")
file_path

'/home/simonet/CFG_Project_G4/CFG_Degree_Project_Group_4/notebooks/archive/characters.csv'

In [4]:
# Read unclean dataset to CSV and return first five rows for review
character_data = pd.read_csv('/home/simonet/my_jupyter_env/characters.csv')
character_data.head()

Unnamed: 0,Name,Gender,Game,Age,Age_range,Playable,Sexualization,Id,Species,Side,Relevance,Romantic_Interest
0,Farah,Female,CODMW,27,Adult,1,0,CODMW_Farah,Human,P,PA,No
1,Protagonist,Custom,PSS,Teenager,Teenager,1,0,PSS_Protagonist,Human,P,PA,No
2,Magnolia,Female,PSS,Elderly,Elderly,0,0,PSS_Magnolia,Human,P,SC,No
3,Sonia,Female,PSS,26,Adult,0,0,PSS_Sonia,Human,P,SC,No
4,Marnie,Female,PSS,Teenager,Teenager,0,0,PSS_Marnie,Human,B,MC,No


# <b>Data Cleaning</b>

In [5]:
#Initial check for null values, none 
character_data.isnull().sum().sum()

0

In [6]:
#Initial check for null values, none 
character_data.isnull().sum()

Name                 0
Gender               0
Game                 0
Age                  0
Age_range            0
Playable             0
Sexualization        0
Id                   0
Species              0
Side                 0
Relevance            0
Romantic_Interest    0
dtype: int64

## An inital sum taken of <b>null values</b> within the character dataset, in order to return a sum of the individual column sums of null a .sum().sum() can is called. A breakdown has been provided using the single .sum() function. No need for null removal as none in our dataset.

In [7]:
duplicate_count = character_data[character_data.duplicated()].shape[0]
print(duplicate_count)

0


## The dataset is checked for <b>duplicated values</b> as the presence of duplicates will skew our dataset, and result in an inaccurate representation of character metrics and relationships. No duplicate values exist in this dataset.

In [8]:
#character_data.dtypes, all data types correspond to data in column, can be manipulated with ease
character_data.dtypes

Name                 object
Gender               object
Game                 object
Age                  object
Age_range            object
Playable              int64
Sexualization         int64
Id                   object
Species              object
Side                 object
Relevance            object
Romantic_Interest    object
dtype: object

## Character dataset is reviewed for <b>dataypes</b>, a check is made for any inconsistencies in data typing. All data types listed are consistent with the data itself.<br> However, <b>Romantic_Interest</b> column will be converted to int64 data type for consistency and easier statistical use and for calculations.

In [9]:
character_data = character_data.rename(columns = {'Side':'Alliance'})
character_data = character_data.rename(columns = {'Relevance':'Function'})
character_data = character_data.rename(columns = {'Sexualization':'Sexualization_total'})

## Three of the <b>column's names have been changed</b> using the .rename() function. This is done in order to better capture the dynamics of the column's data. <br>'Side' changed to 'Alliance' to capture the relationship between the character of focus and others in the game, 'Relevance' has been changed to 'Function' to highlight the character's function in the game and seemed to be a more neutral term. <br>Finally, 'Sexualization' has been changed to 'Sexualization_total', a minor change but clarified this column is a sum of the sexualization types in the sexualization table.

In [10]:
uk_cd = character_data['Age'].value_counts()
uk_cd

Age
Adult          231
Unknown        136
Middle-aged     31
Teenager        27
Elderly         23
Child           21
Young adult     16
21               9
16               9
20               9
17               8
25               6
38               6
23               6
22               6
19               6
11               6
36               5
13               5
26               5
24               4
50               3
18               3
42               3
53               3
48               3
43               3
70               2
39               2
10               2
40               2
49               2
15               2
27               2
Infant           2
33               2
46               2
41               2
37               2
61               1
12               1
93               1
57               1
44               1
8                1
14               1
54               1
47               1
120              1
66               1
29               1
30               1
28      

In [11]:
character_data = character_data.drop(columns=['Age'])

## The dataset's 'Age' <b>column has been dropped</b> after a .value_counts() review was actioned, this is as the ages were useful however as a number of ages were called as 'unknown' the 'Age' column offered very little difference in terms of helpful added to the 'Age Range' column. These columns for our need appeared to be offering the same information.

In [12]:
#Reviewing dataset following changes so far
character_data.head()

Unnamed: 0,Name,Gender,Game,Age_range,Playable,Sexualization_total,Id,Species,Alliance,Function,Romantic_Interest
0,Farah,Female,CODMW,Adult,1,0,CODMW_Farah,Human,P,PA,No
1,Protagonist,Custom,PSS,Teenager,1,0,PSS_Protagonist,Human,P,PA,No
2,Magnolia,Female,PSS,Elderly,0,0,PSS_Magnolia,Human,P,SC,No
3,Sonia,Female,PSS,Adult,0,0,PSS_Sonia,Human,P,SC,No
4,Marnie,Female,PSS,Teenager,0,0,PSS_Marnie,Human,B,MC,No


In [13]:
#Altering Alliance column values -consideration clarity, however run time may be slightly longer - however preferred due to small dataset
character_data['Alliance'] = character_data['Alliance'].replace({'P':'Protagonist', 'A':'Antagonist', 'B':'Both'})

## 'Alliance' <b>column's initial's replaced</b> to the full word this is for those that are not part of the direct design team, those involved in the game making process that are not developers have a grasp of what alliance types there are. <br><br><i>This was not actioned for 'Function' although this would have been ideal, however, the running time was a consideration - in the case of 'Function' a key will be provided in visualizations to clarify the character function codes.</i>

In [14]:
#replacement of Romantic interest str with int, consideration consistency with 'Playable' and 'Sexualization' fields, may improve run time
character_data['Romantic_Interest'] = character_data['Romantic_Interest'].replace({'No':0, 'Yes':1, 'Opt':2}).astype(int)

  character_data['Romantic_Interest'] = character_data['Romantic_Interest'].replace({'No':0, 'Yes':1, 'Opt':2}).astype(int)


## The 'Romantic_Interest' columns values were also <b>replaced</b> using the .replace() function. Romantic interest changed from and object (str) datatype to int64, this change was actioned to ensure our 'Romantic_Interest' column could be used in calculations with ease.

In [15]:
character_data.dtypes

Name                   object
Gender                 object
Game                   object
Age_range              object
Playable                int64
Sexualization_total     int64
Id                     object
Species                object
Alliance               object
Function               object
Romantic_Interest       int64
dtype: object

In [16]:
#'Romantic_interest' column check using a filter for romantic level as option:
option_2 = character_data[character_data['Romantic_Interest'] == 2]
option_2.head(3)

Unnamed: 0,Name,Gender,Game,Age_range,Playable,Sexualization_total,Id,Species,Alliance,Function,Romantic_Interest
34,Triss,Female,TW3,Adult,0,2,TW3_Triss,Human,Protagonist,SC,2
35,Keira Metz,Female,TW3,Adult,0,2,TW3_Keira,Human,Protagonist,SC,2
248,Ann Takamaki,Female,PSN5,Teenager,1,2,PSN5_Ann,Human,Protagonist,DA,2


## <b>Cleaning data row: Name change</b>

In [17]:
name_clarif = character_data[character_data['Name'] == '???']
name_clarif

Unnamed: 0,Name,Gender,Game,Age_range,Playable,Sexualization_total,Id,Species,Alliance,Function,Romantic_Interest
594,???,Male,TBOIR,Child,1,0,TBOIR_???,Human,Protagonist,MC,0


## Reviewing the tables Name column a character named '???' was found. Just to be sure this was not an error, the video game The Binding of Issac: Four souls was researched for a character of this name. This is a correct character name entry, however the character also goes by another name. So the additional name 'Blue Baby' has been added, in case the character's name is used for our exploration it is clear that the use of '???' is not a mistake. So the <b>character's name has been altered/replaced</b> with '???BlueBaby'.

In [18]:
character_data = character_data.replace({'Name':{'???': '???BlueBaby'},'Id':{'TBOIR_???':'TBOIR_???BlueBaby'}})

In [19]:
find = character_data[character_data['Name'] == '???BlueBaby']
find

Unnamed: 0,Name,Gender,Game,Age_range,Playable,Sexualization_total,Id,Species,Alliance,Function,Romantic_Interest
594,???BlueBaby,Male,TBOIR,Child,1,0,TBOIR_???BlueBaby,Human,Protagonist,MC,0


In [20]:
character_data.head()

Unnamed: 0,Name,Gender,Game,Age_range,Playable,Sexualization_total,Id,Species,Alliance,Function,Romantic_Interest
0,Farah,Female,CODMW,Adult,1,0,CODMW_Farah,Human,Protagonist,PA,0
1,Protagonist,Custom,PSS,Teenager,1,0,PSS_Protagonist,Human,Protagonist,PA,0
2,Magnolia,Female,PSS,Elderly,0,0,PSS_Magnolia,Human,Protagonist,SC,0
3,Sonia,Female,PSS,Adult,0,0,PSS_Sonia,Human,Protagonist,SC,0
4,Marnie,Female,PSS,Teenager,0,0,PSS_Marnie,Human,Both,MC,0


In [21]:
character_data.to_csv('cleaned_characters.csv', index=False)