# Loading Android Data Set

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib
import scipy


android = pd.read_csv('Baird_original.csv', index_col = 0)

# Data Quality Report Generation

In [2]:
outputString = ""

for feature in android:

    print(android[feature].name)
    print("Count: ", android[feature].count())
    print("Missing %: ", android[feature].isna().sum() / android[feature].count())
    print("Card. : ", android[feature].nunique())
    
    if not android[feature].dtype == 'object':
        
        quartiles = android[feature].quantile([0.25,0.75]).values
        
        print("Min: ", android[feature].min())
        print("1st Quartile: ", quartiles[0])
        print("Mean: ", android[feature].mean())
        print("Median: ", android[feature].median())
        print("3rt Quartile: ", quartiles[1])
        print("Max: ", android[feature].max())
        print("Std Dev: ", android[feature].std(), "\n\n")
        
    else:
        
        print("Mode: ", android[feature].mode()[0])
        print("Mode Freq: ", android[feature].value_counts()[0])
        print("Mode %: ", android[feature].value_counts()[0] / android[feature].count())
        print("2nd Mode: ", android[feature].value_counts().axes[0][1])
        print("2nd Mode Freq: ", android[feature].value_counts()[1])
        print("2nd Mode %: ", android[feature].value_counts()[1] / android[feature].count() ,"\n\n")
    
    

Category
Count:  10840
Missing %:  0.0
Card. :  33
Mode:  FAMILY
Mode Freq:  1972
Mode %:  0.1819188191881919
2nd Mode:  GAME
2nd Mode Freq:  1144
2nd Mode %:  0.1055350553505535 


Rating
Count:  9366
Missing %:  0.15737774930600043
Card. :  39
Min:  1.0
1st Quartile:  4.0
Mean:  4.191757420456972
Median:  4.3
3rt Quartile:  4.5
Max:  5.0
Std Dev:  0.5152188586177886 


Reviews
Count:  10840
Missing %:  0.0
Card. :  6001
Min:  0
1st Quartile:  38.0
Mean:  444152.89603321033
Median:  2094.0
3rt Quartile:  54775.5
Max:  78158306
Std Dev:  2927760.603885666 


Size
Count:  10840
Missing %:  0.0
Card. :  461
Mode:  Varies with device
Mode Freq:  1695
Mode %:  0.15636531365313652
2nd Mode:  11M
2nd Mode Freq:  198
2nd Mode %:  0.018265682656826567 


Type
Count:  10839
Missing %:  9.225943352707814e-05
Card. :  2
Mode:  Free
Mode Freq:  10039
Mode %:  0.9261924531783375
2nd Mode:  Paid
2nd Mode Freq:  800
2nd Mode %:  0.07380754682166252 


Price
Count:  10840
Missing %:  0.0
Card. :  92
M

# Data Prep

In [3]:
android = android.dropna()

if android.Price.dtype == 'object':
    android.Price = [x.strip('$') for x in android.Price]

if android.Installs.dtype == 'object':
    android.Installs = [x.replace('+','') for x in android.Installs]
    android.Installs = [x.replace(',','') for x in android.Installs]
    z = 1
    
if android.Size.dtype == 'object':
    android.Size = [x.replace('M','') for x in android.Size]
    android.Size = [x.replace('Varies with device','1') for x in android.Size]
    android.Size = [x.replace('k','') for x in android.Size]
    
   

#Converts certain features from strings to numeric
android['Reviews'] = pd.to_numeric(android['Reviews'])
android['Price'] = pd.to_numeric(android['Price'])
android['Installs'] = pd.to_numeric(android['Installs'])
android['Size'] = pd.to_numeric(android['Size'])

android = android.dropna()

android.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9360 entries, Photo Editor & Candy Camera & Grid & ScrapBook to iHoroscope - 2018 Daily Horoscope & Astrology
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        9360 non-null   object 
 1   Rating          9360 non-null   float64
 2   Reviews         9360 non-null   int64  
 3   Size            9360 non-null   float64
 4   Type            9360 non-null   object 
 5   Price           9360 non-null   float64
 6   Content Rating  9360 non-null   object 
 7   Genres          9360 non-null   object 
 8   Last Updated    9360 non-null   object 
 9   Current Ver     9360 non-null   object 
 10  Android Ver     9360 non-null   object 
 11  Installs        9360 non-null   int64  
dtypes: float64(3), int64(2), object(7)
memory usage: 658.1+ KB


# Outlier management

In [4]:
android = android[np.abs(android.Installs-android.Installs.mean()) <= (3*android.Installs.std())]
android.Genres = [x.split(';')[0] for x in android.Genres]
android['Genres'] = android['Genres'].replace('Educational', 'Education')
android['Genres'] = android['Genres'].replace('Music & Audio', 'Music') 
android = android[np.abs(android.Price-android.Price.mean()) <= (3*android.Price.std())]

# Class Mapping and more Outlier Management


In [5]:
class_mapping = {label:idx for idx, label in enumerate(np.unique(android['Type']))}
android['Type'] = android['Type'].map(class_mapping)

class_mapping = {label:idx for idx, label in enumerate(np.unique(android['Category']))}
android['Category'] = android['Category'].map(class_mapping)

class_mapping = {label:idx for idx, label in enumerate(np.unique(android['Content Rating']))}
android['Content Rating'] = android['Content Rating'].map(class_mapping)

class_mapping = {label:idx for idx, label in enumerate(np.unique(android['Genres']))}
android['Genres'] = android['Genres'].map(class_mapping)

android = android[np.abs(android.Category-android.Category.mean()) <= (3*android.Category.std())]
android = android[np.abs(android['Content Rating']-android['Content Rating'].mean()) <= (3*android['Content Rating'].std())]
android = android[np.abs(android.Genres-android.Genres.mean()) <= (3*android.Genres.std())]

android.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9212 entries, Photo Editor & Candy Camera & Grid & ScrapBook to iHoroscope - 2018 Daily Horoscope & Astrology
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        9212 non-null   int64  
 1   Rating          9212 non-null   float64
 2   Reviews         9212 non-null   int64  
 3   Size            9212 non-null   float64
 4   Type            9212 non-null   int64  
 5   Price           9212 non-null   float64
 6   Content Rating  9212 non-null   int64  
 7   Genres          9212 non-null   int64  
 8   Last Updated    9212 non-null   object 
 9   Current Ver     9212 non-null   object 
 10  Android Ver     9212 non-null   object 
 11  Installs        9212 non-null   int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 791.7+ KB


# Min Max Normalization

In [6]:
from sklearn import preprocessing 

df = android[["Category","Rating","Reviews","Size","Type","Price","Content Rating","Genres","Installs"]].copy()
df = df.drop_duplicates()

df_cols = df.columns[0:8]
X = df[['Category','Rating','Reviews','Size','Type','Price','Content Rating','Genres']].values
y = df[['Installs']].values

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
NormalizedDf = pd.DataFrame(x_scaled, columns = df_cols)
NormalizedDf['Installs'] = df['Installs'].values

NormalizedDf.head(10)

df = NormalizedDf

# Feature Selection

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from numpy import set_printoptions

fX = df[['Category','Rating','Reviews','Size','Type','Price','Content Rating','Genres']].values
fY = df[['Installs']].values

test = SelectKBest(score_func=f_classif, k=4)
fit = test.fit(fX, fY)


set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(fX)


print(features[0:5,:])

[  3.487  32.269 414.642   3.76   48.115  21.42   11.558   2.814]
[[7.750e-01 3.519e-06 0.000e+00 0.000e+00]
 [7.250e-01 2.152e-05 0.000e+00 0.000e+00]
 [9.250e-01 1.949e-03 0.000e+00 0.000e+00]
 [8.750e-01 4.803e-03 0.000e+00 0.000e+00]
 [8.250e-01 2.152e-05 0.000e+00 0.000e+00]]


  y = column_or_1d(y, warn=True)


# ABT Creation

In [8]:
df.to_csv('Baird_pp.csv')