###### Importing dependencies

In [41]:
import re
import warnings

import numpy as np
import pandas as pd
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and syntax warnings
# raised by later cells; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [56]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("google_play_store_dataset.csv")

In [57]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


###### Data Cleaning

###### Reviews

In [58]:
# Peek at the raw Reviews values before cleaning.
df["Reviews"].head(5)

0       159
1       967
2     87510
3    215644
4       967
Name: Reviews, dtype: object

In [59]:
# Clean the Reviews column.
# Entries are plain counts ('159') or carry a magnitude suffix ('12k', '3.0M').
# Each entry is split into its numeric part and optional suffix and the number
# is then scaled. Swapping the suffix for zeros and deleting the '.' afterwards
# would inflate decimal values tenfold ('3.0M' -> 30,000,000).
reviews_text = df['Reviews'].astype(str).str.strip()
review_parts = reviews_text.str.extract(
    r'^(?P<number>\d+(?:\.\d+)?)(?P<suffix>[kKmM]?)$'
)

# No suffix (or no match at all) leaves the multiplier at 1.
suffix_multiplier = (
    review_parts['suffix']
    .str.lower()
    .map({'k': 1_000, 'm': 1_000_000})
    .fillna(1)
)

# Entries that do not match the pattern have no numeric part and become NaN.
review_counts = pd.to_numeric(review_parts['number'], errors='coerce') * suffix_multiplier

# round() removes float noise introduced by the scaling (e.g. 2.3 * 1000).
df['Reviews'] = review_counts.round()


In [60]:
# Treat unparseable review counts as 0, then store the column as integers
reviews_filled = df['Reviews'].fillna(0)
df['Reviews'] = reviews_filled.astype(int)

In [61]:
# Verify: Reviews should now display with an integer dtype.
df["Reviews"]

0           159
1           967
2         87510
3        215644
4           967
          ...  
10836        38
10837         4
10838         3
10839       114
10840    398307
Name: Reviews, Length: 10841, dtype: int32

###### Size

In [62]:
# Inspect the raw Size values before cleaning.
df['Size']

0                       19M
1                       14M
2                      8.7M
3                       25M
4                      2.8M
                ...        
10836                   53M
10837                  3.6M
10838                  9.5M
10839    Varies with device
10840                   19M
Name: Size, Length: 10841, dtype: object

In [63]:
# The placeholder text carries no size information, so mark it as missing
df['Size'] = df['Size'].replace({'Varies with device': np.nan})

def clean_size(value):
    """Convert a size string such as '19M' or '14k' to megabytes (float).

    Parameters
    ----------
    value : object
        Raw cell value. A string ending in 'M' is read as megabytes and a
        string ending in 'k' as kilobytes (divided by 1024 to obtain MB).
        Surrounding whitespace is ignored.

    Returns
    -------
    float
        The size in MB, or NaN when `value` is not a string, has no
        recognised suffix, or its numeric part cannot be parsed.
    """
    if not isinstance(value, str):
        return np.nan

    text = value.strip()

    # Divisor that brings each supported unit to MB.
    divisors = {"M": 1.0, "k": 1024.0}
    divisor = divisors.get(text[-1:])
    if divisor is None:
        return np.nan

    try:
        return float(text[:-1]) / divisor
    except ValueError:
        # Malformed number (e.g. '1,000M' or a bare 'M'): report it as missing
        # instead of aborting the whole column conversion.
        return np.nan

# Convert every Size entry to MB, element by element
df['Size'] = df['Size'].map(clean_size)

In [64]:
# Verify: Size should now be float values.
df['Size'].head(5)

0    19.0
1    14.0
2     8.7
3    25.0
4     2.8
Name: Size, dtype: float64

###### Installs

In [65]:
# Inspect the raw Installs values before cleaning.
df['Installs'].head(20)

0         10,000+
1        500,000+
2      5,000,000+
3     50,000,000+
4        100,000+
5         50,000+
6         50,000+
7      1,000,000+
8      1,000,000+
9         10,000+
10     1,000,000+
11     1,000,000+
12    10,000,000+
13       100,000+
14       100,000+
15         5,000+
16       500,000+
17        10,000+
18     5,000,000+
19    10,000,000+
Name: Installs, dtype: object

In [66]:
# List the rows whose Installs value contains a letter, i.e. is not a pure count.
df[df['Installs'].str.contains('[A-Za-z]', na=False)][['App','Installs']]


Unnamed: 0,App,Installs
10472,Life Made WI-Fi Touchscreen Photo Frame,Free


In [67]:
# Normalise Installs (e.g. '10,000+') to a numeric column.
# Work on text and drop the '+' and thousands separators first.
installs_text = (
    df['Installs']
    .astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Keep the first run of digits; entries without any digit (e.g. 'Free') become NaN.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence.
installs_digits = installs_text.str.extract(r'(\d+)', expand=False)

# Convert to numeric safely
df['Installs'] = pd.to_numeric(installs_digits, errors='coerce')


In [69]:
# Verify: Installs should now be numeric.
df["Installs"].head()

0       10000.0
1      500000.0
2     5000000.0
3    50000000.0
4      100000.0
Name: Installs, dtype: float64

###### Price

In [71]:
# Inspect the raw Price values before cleaning.
df["Price"].head()

0    0
1    0
2    0
3    0
4    0
Name: Price, dtype: object

In [75]:
# Normalise Price to a float column.
price_text = df['Price'].astype(str).str.strip()

# A price is a run of digits with an optional '$' prefix and an optional
# decimal part ('0', '$4.99', '4.99'). Anything else ('Free', 'Everyone', ...)
# is counted as 0. Decimal prices written without '$' are accepted too.
is_price = price_text.str.fullmatch(r'\$?\d+(?:\.\d+)?')
price_text = price_text.where(is_price, '0').str.lstrip('$')

# Convert to float; fillna(0) is a safety net for anything still unparseable.
df['Price'] = pd.to_numeric(price_text, errors='coerce').fillna(0)

In [78]:
# Verify: Price should now be float values.
df["Price"].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Price, dtype: float64

###### Rating

In [80]:
# Inspect Rating before imputation; missing entries show up as NaN.
df['Rating']

0        4.1
1        3.9
2        4.7
3        4.5
4        4.3
        ... 
10836    4.5
10837    5.0
10838    NaN
10839    4.5
10840    4.5
Name: Rating, Length: 10841, dtype: float64

In [81]:
# Handling missing values: impute each missing rating with the mean rating of
# the app's category (built-in 'mean' avoids a Python-level lambda per group).
category_mean = df.groupby('Category')['Rating'].transform('mean')

# Fall back to the overall mean when a category has no rated app at all, so
# that no NaN survives this step. The right-hand side is evaluated before the
# assignment, so the overall mean is taken over the observed ratings only.
df['Rating'] = df['Rating'].fillna(category_mean).fillna(df['Rating'].mean())

In [82]:
# Verify: previously missing ratings should now hold an imputed value.
df["Rating"]

0        4.100000
1        3.900000
2        4.700000
3        4.500000
4        4.300000
           ...   
10836    4.500000
10837    5.000000
10838    4.189143
10839    4.500000
10840    4.500000
Name: Rating, Length: 10841, dtype: float64

###### Last Updated

In [84]:
# Inspect the raw Last Updated strings before parsing.
df["Last Updated"].head()

0     January 7, 2018
1    January 15, 2018
2      August 1, 2018
3        June 8, 2018
4       June 20, 2018
Name: Last Updated, dtype: object

In [87]:
# Cast to string so that every entry can be pattern-checked as text before parsing

df['Last Updated'] = df['Last Updated'].astype(str)

# Matches a standalone four-digit year that starts with 19 or 20.
_YEAR_PATTERN = re.compile(r'\b(19|20)\d{2}\b')


def valid_date(s):
    """Return `s` if it mentions a 19xx/20xx year, otherwise NaN.

    Parameters
    ----------
    s : object
        Candidate date text. Non-string values are treated as invalid.

    Returns
    -------
    str or float
        `s` unchanged when it contains a standalone four-digit year beginning
        with 19 or 20; `np.nan` otherwise.
    """
    # The isinstance guard keeps NaN/None from reaching the regex, which would
    # raise TypeError on non-string input.
    if isinstance(s, str) and _YEAR_PATTERN.search(s):
        return s
    return np.nan

# Blank out entries that do not mention a plausible year, then parse what is
# left; text that still fails to parse becomes NaT
year_checked = df['Last Updated'].map(valid_date)
df['Last Updated'] = pd.to_datetime(year_checked, errors='coerce')

In [93]:
# Impute missing dates with the median date of the app's category
last_updated_by_category = df.groupby('Category')['Last Updated']
df['Last Updated'] = last_updated_by_category.transform(
    lambda dates: dates.fillna(dates.median())
)

In [94]:
# Verify: Last Updated should now be a datetime column.
df["Last Updated"].head()

0   2018-01-07
1   2018-01-15
2   2018-08-01
3   2018-06-08
4   2018-06-20
Name: Last Updated, dtype: datetime64[ns]

###### Categorical columns

In [86]:
# A missing Type is treated as 'Free'
df['Type'] = df['Type'].fillna('Free')

# The other categorical columns take their most frequent value
for column in ('Content Rating', 'Current Ver', 'Android Ver'):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

In [95]:
# Verify: dtypes and non-null counts of the frame after the column-wise cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             10841 non-null  object        
 1   Category        10841 non-null  object        
 2   Rating          10841 non-null  float64       
 3   Reviews         10841 non-null  int32         
 4   Size            9145 non-null   float64       
 5   Installs        10840 non-null  float64       
 6   Type            10841 non-null  object        
 7   Price           10841 non-null  float64       
 8   Content Rating  10841 non-null  object        
 9   Genres          10841 non-null  object        
 10  Last Updated    10840 non-null  datetime64[ns]
 11  Current Ver     10841 non-null  object        
 12  Android Ver     10841 non-null  object        
dtypes: datetime64[ns](1), float64(4), int32(1), object(7)
memory usage: 1.0+ MB


In [92]:
# Count the missing values still present in each column.
df.isnull().sum()

App                  0
Category             0
Rating               0
Reviews              0
Size              1696
Installs             1
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         1
Current Ver          0
Android Ver          0
dtype: int64

In [96]:
# Fill the remaining gaps: column mean for the numeric columns,
# median date for Last Updated
size_mean = df['Size'].mean()
df['Size'] = df['Size'].fillna(size_mean)

installs_mean = df['Installs'].mean()
df['Installs'] = df['Installs'].fillna(installs_mean)

last_updated_median = df['Last Updated'].median()
df['Last Updated'] = df['Last Updated'].fillna(last_updated_median)

# Confirm that no column has missing values left
df.isna().sum()


App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

###### Saving cleaned dataset

In [97]:
# Write the cleaned frame next to the notebook; index=False omits the row index from the file.
df.to_csv("cleaned.csv", index = False)