# Attempt number 2
This attempt includes findings from my previous attempt, bringing them together to form a more refined approach to producing a fully cleaned dataset.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Locate the cleaned dataset relative to the current working directory
script_path = Path.cwd()
data_path_cleaned = script_path.joinpath("data", "games_cleaned.csv")

# Load the CSV, then convert the data type of 'Release date' to datetime;
# values that cannot be parsed are coerced to NaT
df = pd.read_csv(data_path_cleaned, encoding='utf-8')
release_dates = pd.to_datetime(df['Release date'], errors='coerce')
df['Release date'] = release_dates

print(df.dtypes)

AppID                                int64
Name                                object
Release date                datetime64[ns]
Estimated owners                    object
Required age                         int64
Price                              float64
User score                           int64
Positive                             int64
Negative                             int64
Recommendations                      int64
Average playtime forever             int64
Developers                          object
Publishers                          object
Categories                          object
Genres                              object
Tags                                object
Average owners                     float64
Positive ratio                     float64
Estimated average owners           float64
dtype: object


## Estimated average owners

In [None]:
# Split 'Estimated owners' ("low - high") into its two bounds and average them.
# The cleaned strings are built directly from df: this keeps the cell runnable on a
# fresh kernel (owners_clean is not defined anywhere earlier) and guarantees that the
# result shares df's index, so the assignment below lines up row by row.
owners_clean = df['Estimated owners'].str.replace(' ', '', regex=False)

# One column per bound: 0 = lower, 1 = upper
owners_split = owners_clean.str.split('-', expand=True)

# Anything that is not a number becomes NaN instead of raising
owners_split[0] = pd.to_numeric(owners_split[0], errors='coerce')
owners_split[1] = pd.to_numeric(owners_split[1], errors='coerce')

# Midpoint of the range; NaN if either bound is missing
df['Estimated average owners'] = (owners_split[0] + owners_split[1]) / 2

print(df[['Estimated owners', 'Estimated average owners']].head(10))

  Estimated owners  Estimated average owners
0        0 - 20000                   10000.0
1        0 - 20000                   10000.0
2   50000 - 100000                   10000.0
3        0 - 20000                   10000.0
4   50000 - 100000                   10000.0
5        0 - 20000                   75000.0
6        0 - 20000                   10000.0
7        0 - 20000                   10000.0
8  100000 - 200000                   10000.0
9        0 - 20000                   75000.0


## Rearranging the dataset

In [None]:
# Normalised game name (lowercase, surrounding whitespace removed) used to spot duplicates
df['Name_lowercase'] = df['Name'].str.lower().str.strip()

# Every row whose normalised name occurs more than once
duplicates = df.loc[df['Name_lowercase'].duplicated(keep=False)]
print(duplicates.shape)

# Share of positive reviews; a review total of 0 is turned into NaN so the
# division yields NaN for games without reviews
total_reviews = (df['Positive'] + df['Negative']).replace(0, np.nan)
df['Positive ratio'] = df['Positive'] / total_reviews

# Order rows by the priorities: owners, then positive ratio, then release date (all descending)
sort_priority = ['Estimated average owners', 'Positive ratio', 'Release date']
df_sorted = df.sort_values(by=sort_priority, ascending=[False] * len(sort_priority))

print(df_sorted[sort_priority].head(10))

(660, 20)
       Estimated average owners  Positive ratio Release date
17585               150000000.0        0.451613   2018-03-27
7030                 75000000.0        1.000000   2024-02-08
30583                75000000.0        0.990932   2020-09-03
8885                 75000000.0        0.800000   2019-01-14
46158                75000000.0        0.796296   2017-01-05
96966                75000000.0        0.222222   2017-04-05
7015                 35000000.0        1.000000   2024-03-21
28893                35000000.0        1.000000   2019-11-14
31603                35000000.0        0.967552   2019-11-05
32756                35000000.0        0.952239   2023-09-29


### Missing values

In [None]:
# Missing-value count per column, before any filling
print(df.isna().sum())

AppID                            0
Name                             1
Release date                   129
Estimated owners                 0
Required age                     0
Price                            0
User score                       0
Positive                         0
Negative                         0
Recommendations                  0
Average playtime forever         0
Developers                    6443
Publishers                    6741
Categories                    7533
Genres                        6414
Tags                         36873
Average owners              110326
Positive ratio               37036
Estimated average owners        23
Name_lowercase                   1
dtype: int64


In [None]:
# For each major column, count the rows with a missing value whose game name also
# appears on some row where that column is populated
columns_to_check = ['Developers', 'Publishers', 'Categories', 'Genres', 'Tags']

for col in columns_to_check:
    is_missing = df[col].isna()
    missing_rows = df.loc[is_missing]

    # Names that have a value for this column on at least one row
    names_with_value = df.loc[~is_missing, 'Name_lowercase']
    can_be_filled = missing_rows['Name_lowercase'].isin(names_with_value)

    fillable_rows = missing_rows.loc[can_be_filled]

    print(f"{col}: {len(fillable_rows)} rows can be filled from duplicates.")


Developers: 2 rows can be filled from duplicates.
Publishers: 4 rows can be filled from duplicates.
Categories: 6 rows can be filled from duplicates.
Genres: 0 rows can be filled from duplicates.
Tags: 153 rows can be filled from duplicates.



To fill in missing data across duplicated game entries, I grouped the rows by `Name_lowercase` and used `.update` together with a group-wise `.ffill` and `.bfill`, so that only missing values are propagated within each group. This way, no matter which duplicate holds the information, all rows for the same game end up sharing the filled values.


In [None]:
# Propagate known values between rows that share the same 'Name_lowercase'
cols_to_fill = ['Developers', 'Publishers', 'Categories', 'Genres', 'Tags']

for col in cols_to_fill:
    # Forward pass within each name group; update() writes back only the non-NA results
    forward_filled = df.groupby('Name_lowercase')[col].ffill()
    df.update(forward_filled)

    # Backward pass, computed on the frame as it stands after the forward pass
    backward_filled = df.groupby('Name_lowercase')[col].bfill()
    df.update(backward_filled)


In [None]:
# Re-run the fillable-from-duplicates count now that the group-wise fill has been applied
cols_to_check = ['Developers', 'Publishers', 'Categories', 'Genres', 'Tags']

for col in cols_to_check:
    has_value = df[col].notna()
    missing_rows = df.loc[~has_value]

    # A missing row counts as fillable when its name also occurs on a populated row
    populated_names = df.loc[has_value, 'Name_lowercase']
    can_be_filled = missing_rows['Name_lowercase'].isin(populated_names)

    fillable_rows = missing_rows.loc[can_be_filled]

    print(f"{col}: {len(fillable_rows)} rows can be filled from duplicates.")


Developers: 0 rows can be filled from duplicates.
Publishers: 0 rows can be filled from duplicates.
Categories: 0 rows can be filled from duplicates.
Genres: 0 rows can be filled from duplicates.
Tags: 0 rows can be filled from duplicates.


After filling missing data within the duplicate groups, 5 rows are still empty, although they are detected as fillable because a duplicate contains the much-needed information; my `.update` approach does not seem to work in these specific situations.

This may be because (1) there are small differences in how the groups were formed, or (2) the update step did not replace the missing values.  
Since the number is small, the next step is to check these rows directly or to combine the duplicates into a single row per game.

In [None]:
# Missing-value count per column after filling from duplicates
print(df.isna().sum())

AppID                            0
Name                             1
Release date                   129
Estimated owners                 0
Required age                     0
Price                            0
User score                       0
Positive                         0
Negative                         0
Recommendations                  0
Average playtime forever         0
Developers                    6441
Publishers                    6737
Categories                    7527
Genres                        6414
Tags                         36720
Average owners              110326
Positive ratio               37036
Estimated average owners        23
Name_lowercase                   1
dtype: int64


In [None]:
# Drop the rows that have no 'Name'. Because 'Name_lowercase' is derived from 'Name',
# those rows are also missing 'Name_lowercase'.
# .copy() makes df an independent frame, so the column assignments below act on df
# itself and do not raise SettingWithCopyWarning.
df = df.dropna(subset=['Name']).copy()

# NOTE: 'Release date' was converted to datetime earlier; filling it with a string
# leaves the column holding both timestamps and the placeholder text 'Unknown'.
df['Release date'] = df['Release date'].fillna('Unknown')

# Text columns: mark the remaining missing entries as 'Unknown'
for text_col in ['Developers', 'Publishers', 'Categories', 'Genres', 'Tags']:
    df[text_col] = df[text_col].fillna('Unknown')

# Numeric columns: missing becomes 0. For 'Positive ratio' this covers games
# with no reviews at all (their ratio was left as NaN when it was computed).
for numeric_col in ['Estimated average owners', 'Positive ratio']:
    df[numeric_col] = df[numeric_col].fillna(0)

print(df.isnull().sum())

AppID                            0
Name                             0
Release date                     0
Estimated owners                 0
Required age                     0
Price                            0
User score                       0
Positive                         0
Negative                         0
Recommendations                  0
Average playtime forever         0
Developers                       0
Publishers                       0
Categories                       0
Genres                           0
Tags                             0
Average owners              110325
Positive ratio                   0
Estimated average owners         0
Name_lowercase                   0
dtype: int64


### Duplicates

In [None]:
# Destination for the deduplicated dataset. Writing to a separate file keeps the
# input file (data_path_cleaned) intact, so the notebook gives the same result when
# it is re-run from the top; this is also the file the following cell reads back.
data_path_final = script_path / "data" / "games_cleaned_final.csv"

# 'Release date' may contain the placeholder 'Unknown' next to real timestamps.
# Sort on a datetime view of it instead: placeholders become NaT and are placed
# last, so a row with a known date wins the tie-break over one without.
release_sort_key = pd.to_datetime(df['Release date'], errors='coerce')

# Sorting by the priorities: owners, then positive ratio, then release date (all descending)
df_sorted = (
    df.assign(_release_sort_key=release_sort_key)
    .sort_values(
        by=['Estimated average owners', 'Positive ratio', '_release_sort_key'],
        ascending=[False, False, False],
        na_position='last',
    )
    .drop(columns='_release_sort_key')
)

# After sorting, the first row of each name group is the preferred one to keep
df_drop = df_sorted.drop_duplicates('Name_lowercase', keep='first')
df_drop.to_csv(data_path_final, index=False, encoding="utf-8")

print("cleaned")

cleaned


In [None]:
# Reload the final cleaned file and confirm that no normalised name occurs more than once
data_path_final = script_path.joinpath("data", "games_cleaned_final.csv")
df2 = pd.read_csv(data_path_final, encoding='utf-8')

repeated_name_mask = df2['Name_lowercase'].duplicated(keep=False)
duplicates2 = df2.loc[repeated_name_mask]
print(duplicates2.shape)

(0, 19)
