In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data into a pandas dataframe
df = pd.read_csv('googleplaystore.csv')
df.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [3]:
# Check for missing values
print(df.isnull().sum())

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64


In [4]:
# Create new dataframe with null values removed
df_no_null = df.dropna()

# Verify the new dataframe
print(df_no_null.isnull().sum())

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64


In [5]:
# Drop the "Current Ver" and "Android Ver" columns
df_no_null.drop(['App', 'Last Updated', 'Current Ver', 'Android Ver'], axis=1, inplace=True)
df_no_null.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_null.drop(['App', 'Last Updated', 'Current Ver', 'Android Ver'], axis=1, inplace=True)


Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design
1,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play


In [6]:
# Convert Installs to a numeric format
df_no_null['Installs'] = df_no_null['Installs'].apply(lambda x: int(re.sub('[^0-9]', '', x)))

# Convert "Size" column to numeric format
df_no_null['Size'] = df_no_null['Size'].apply(lambda x: float(re.sub('[^0-9\.]', '', x)) if x != 'Varies with device' else 0)

# Convert "Price" column to numeric format
df_no_null['Price'] = df_no_null['Price'].apply(lambda x: float(x.replace('$', '')) if x != '0' else 0)

df_no_null.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_null['Installs'] = df_no_null['Installs'].apply(lambda x: int(re.sub('[^0-9]', '', x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_null['Size'] = df_no_null['Size'].apply(lambda x: float(re.sub('[^0-9\.]', '', x)) if x != 'Varies with device' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design
1,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play
2,ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design


In [7]:
print(df_no_null['Category'].value_counts())

FAMILY                 1746
GAME                   1097
TOOLS                   733
PRODUCTIVITY            351
MEDICAL                 350
COMMUNICATION           328
FINANCE                 323
SPORTS                  319
PHOTOGRAPHY             317
LIFESTYLE               314
PERSONALIZATION         312
BUSINESS                303
HEALTH_AND_FITNESS      297
SOCIAL                  259
SHOPPING                238
NEWS_AND_MAGAZINES      233
TRAVEL_AND_LOCAL        226
DATING                  195
BOOKS_AND_REFERENCE     178
VIDEO_PLAYERS           160
EDUCATION               155
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     124
FOOD_AND_DRINK          109
HOUSE_AND_HOME           76
WEATHER                  75
AUTO_AND_VEHICLES        73
LIBRARIES_AND_DEMO       64
ART_AND_DESIGN           61
COMICS                   58
PARENTING                50
EVENTS                   45
BEAUTY                   42
Name: Category, dtype: int64


In [8]:
print(df_no_null['Genres'].value_counts())

Tools                      732
Entertainment              533
Education                  468
Action                     358
Productivity               351
                          ... 
Parenting;Brain Games        1
Card;Brain Games             1
Tools;Education              1
Entertainment;Education      1
Strategy;Creativity          1
Name: Genres, Length: 115, dtype: int64


In [9]:
# Select the top 10 unique values for Category
top_categories = df_no_null['Category'].value_counts().nlargest(10).index.tolist()
df_no_null.loc[~df_no_null['Category'].isin(top_categories), 'Category'] = 'Other'

# Select the top 10 unique values for Genres
top_genres = df_no_null['Genres'].value_counts().nlargest(10).index.tolist()
df_no_null.loc[~df_no_null['Genres'].isin(top_genres), 'Genres'] = 'Other'


In [10]:
# Check the top 10 unique values in the "Category" column
print(df_no_null['Category'].value_counts().head(10))

# Check the top 10 unique values in the "Genres" column
print(df_no_null['Genres'].value_counts().head(10))


Other            3482
FAMILY           1746
GAME             1097
TOOLS             733
PRODUCTIVITY      351
MEDICAL           350
COMMUNICATION     328
FINANCE           323
SPORTS            319
PHOTOGRAPHY       317
Name: Category, dtype: int64
Other            5267
Tools             732
Entertainment     533
Education         468
Action            358
Productivity      351
Medical           350
Sports            333
Communication     328
Finance           323
Name: Genres, dtype: int64


In [11]:
# Create binary dummy columns for "Category", "Type", "Content Rating", and "Genres" columns
category_dummies = pd.get_dummies(df_no_null['Category'], prefix='Category', drop_first=True)

type_dummies = pd.get_dummies(df_no_null['Type'], prefix='Type', drop_first=True)

content_rating_dummies = pd.get_dummies(df_no_null['Content Rating'], prefix='Content Rating', drop_first=True)

genres_dummies = pd.get_dummies(df_no_null['Genres'], prefix='Genres', drop_first=True)

normalized_df = pd.concat([df_no_null, category_dummies, type_dummies, content_rating_dummies, genres_dummies], axis=1)

normalized_df.drop(['Category', 'Type', 'Content Rating', 'Genres'], axis=1, inplace=True)
normalized_df.head(3)

Unnamed: 0,Rating,Reviews,Size,Installs,Price,Category_FAMILY,Category_FINANCE,Category_GAME,Category_LIFESTYLE,Category_MEDICAL,...,Genres_Communication,Genres_Education,Genres_Entertainment,Genres_Finance,Genres_Medical,Genres_Other,Genres_Photography,Genres_Productivity,Genres_Sports,Genres_Tools
0,4.1,159,19.0,10000,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,3.9,967,14.0,500000,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,4.7,87510,8.7,5000000,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
# Create a StandardScaler object
scaler = StandardScaler()

# Scale the features in normalized_df
scaled_df = scaler.fit_transform(normalized_df.drop(['Installs', 'Rating'], axis=1))
scaled_df = pd.DataFrame(scaled_df, columns=normalized_df.columns[:-2])

# Split the scaled data into training and testing sets
X_scaled = scaled_df
y_scaled_installs = normalized_df['Installs']
y_scaled_rating = normalized_df['Rating']
X_train_scaled_installs, X_test_scaled_installs, y_train_scaled_installs, y_test_scaled_installs = train_test_split(X_scaled, y_scaled_installs, test_size=0.2, random_state=42)
X_train_scaled_rating, X_test_scaled_rating, y_train_scaled_rating, y_test_scaled_rating = train_test_split(X_scaled, y_scaled_rating, test_size=0.2, random_state=42)

# Train a linear regression model for Installs
linreg_scaled_installs = LinearRegression()
linreg_scaled_installs.fit(X_train_scaled_installs, y_train_scaled_installs)

# Evaluate the performance of the model for Installs
y_pred_scaled_installs = linreg_scaled_installs.predict(X_test_scaled_installs)
mse_scaled_installs = mean_squared_error(y_test_scaled_installs, y_pred_scaled_installs)
rmse_scaled_installs = math.sqrt(mse_scaled_installs)
r2_scaled_installs = r2_score(y_test_scaled_installs, y_pred_scaled_installs)
print("RMSE for Installs:", rmse_scaled_installs)
print("R2 score for Installs:", r2_scaled_installs)

# Train a linear regression model for Rating
linreg_scaled_rating = LinearRegression()
linreg_scaled_rating.fit(X_train_scaled_rating, y_train_scaled_rating)

print('--------------------------------')

# Evaluate the performance of the model for Rating
y_pred_scaled_rating = linreg_scaled_rating.predict(X_test_scaled_rating)
mse_scaled_rating = mean_squared_error(y_test_scaled_rating, y_pred_scaled_rating)
rmse_scaled_rating = math.sqrt(mse_scaled_rating)
r2_scaled_rating = r2_score(y_test_scaled_rating, y_pred_scaled_rating)
print("RMSE for Rating:", rmse_scaled_rating)
print("R2 score for Rating:", r2_scaled_rating)

RMSE for Installs: 68611470.27500103
R2 score for Installs: 0.4527619901689254
--------------------------------
RMSE for Rating: 0.5018654141742721
R2 score for Rating: 0.025164644632002187
