In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import shutil
import zlib

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the entries found in the input directory.
inputEntries = os.listdir("../input")
print(inputEntries)

# Any results you write to the current directory are saved as output.

In [None]:
# SECTION: What do we have here!?

# Load the two CSV files from the input directory into DataFrames.
ReviewsFile = '../input/googleplaystore_user_reviews.csv'
PlayStoreFile = '../input/googleplaystore.csv'
dfReviews = pd.read_csv(ReviewsFile)
dfPlayStore = pd.read_csv(PlayStoreFile)

# Row counts of both frames.
print('Reviews Dataset size: {}'.format(len(dfReviews)))
print('PlayStore Dataset size: {}'.format(len(dfPlayStore)))

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line
# after each report.
print('\n')
dfReviews.info()
print('\n')
dfPlayStore.info()

In [None]:
# SECTION: Creating one single Dataframe with all required columns

# INTENT:
# 1. 'Rating' is the label.
# 2. The goal is to predict the rating from:
#   a. Reviews
#   b. Installs
#   c. Genres (for a more complex version)

# dfReviews is not needed for the intent above, so only dfPlayStore is
# trimmed down here.

# Columns that play no part in the intent above.
unusedColumns = [
    'App', 'Category', 'Size',
    'Type', 'Price', 'Content Rating',
    'Last Updated', 'Current Ver', 'Android Ver',
]

# Remove them from the existing frame object (in-place mutation).
dfPlayStore.drop(columns=unusedColumns, inplace=True)

# Preview the first five remaining rows.
dfPlayStore.head(5)

In [None]:
# Check the values against 'Installs' column.
# A notebook cell only renders its LAST bare expression, so every intermediate
# inspection below is printed explicitly; otherwise it would show nothing.
print(dfPlayStore.groupby('Installs').count())

# There is an unwanted value called 'Free'. Remove it.
dfPlayStore = dfPlayStore[dfPlayStore['Installs'] != 'Free']

# Length of 'Rating' column, with and without null values.
print("Rows in 'Rating' column: {}".format(len(dfPlayStore['Rating'])))
print("Rows with a non-null 'Rating': {}".format(dfPlayStore['Rating'].count()))

# So, quite a few rows have null ratings. Remove them.
# The result is reassigned instead of calling dropna(inplace=True) on the
# filtered frame produced above, so no derived slice is mutated in place.
dfPlayStore = dfPlayStore.dropna(axis=0, subset=['Rating'], how='any')

# Count of records in the Dataframe now
len(dfPlayStore)

In [None]:
# SECTION: Split Dataset for training, evaluation and testing

# SUB SECTION: Using feature 'Installs' for splitting the dataset. So, create hashes of its values.
# Extract all the unique 'Installs' values (sorted, NaN excluded - the same
# keys and order a groupby on the column yields).
installsLst = sorted(dfPlayStore['Installs'].dropna().unique())

# Create a new Dataframe that will hold Hash values of 'Installs'.
# Python's built-in hash() is salted per interpreter process for str values,
# so it would give different numbers after every kernel restart. CRC-32 is
# deterministic, and its 32-bit result is also exactly representable as a
# float should the column ever be converted.
installs = pd.DataFrame(data = installsLst, columns = ['Installs'])
installs['Hash'] = [zlib.crc32(str(value).encode('utf-8')) for value in installsLst]

# Each distinct 'Installs' value must keep its own bucket.
assert installs['Hash'].is_unique, "CRC-32 collision between 'Installs' values"

installs

In [None]:
# SUB SECTION: Merge Hash values with the original DataFrame.
# Any 'Hash' column left over from a previous run of this cell is dropped
# first; without that, re-running the cell would yield 'Hash_x'/'Hash_y'
# instead of 'Hash' and break every later cell that reads dfPlayStore['Hash'].
dfPlayStore = pd.merge(
    dfPlayStore.drop(columns = ['Hash'], errors = 'ignore'),
    installs,
    on = 'Installs',
    how = 'inner'
)

# Preview only the first rows rather than dumping the whole frame.
dfPlayStore.head()

In [None]:
# Create a new DF that will tell us at what number to stop splitting on each Hash value

# Fraction of every Hash group that goes to the training set.
TRAIN_FRACTION = .7

# Number of rated records per Hash value, computed once and reused below.
recordsPerHash = dfPlayStore.groupby('Hash')['Rating'].count()

# Per-hash training quota, rounded to a whole number of rows (kept as float).
individualHashSplit = np.around(recordsPerHash.values * TRAIN_FRACTION)

# One row per Hash value: training quota, the hash itself, and the group size.
dfIndividualHashSplit = pd.DataFrame(data = individualHashSplit, columns = ['Split'])
dfIndividualHashSplit['Hash'] = recordsPerHash.index
dfIndividualHashSplit['Total Records'] = recordsPerHash.values

# Show the table that was just built.
dfIndividualHashSplit

In [None]:
# Split dfPlayStore into train and test frames, stratified by 'Hash'.
#
# Within every Hash group, the first 'Split' rows (in frame order) go to the
# training set and all remaining rows go to the test set. This selects the
# same rows as decrementing a per-hash counter row by row, but is vectorised:
# no per-row iloc lookups and no repeated np.append re-allocations.

# Training quota for each Hash value, indexed by the hash.
trainQuotaByHash = dfIndividualHashSplit.set_index('Hash')['Split']

# Zero-based position of every row inside its own Hash group.
positionInGroup = dfPlayStore.groupby('Hash').cumcount()

# A row is a training row while its group's quota is not yet used up.
# .values sidesteps index alignment; both operands follow dfPlayStore's row order.
isTrainRow = positionInGroup.values < dfPlayStore['Hash'].map(trainQuotaByHash).values

# Columns carried over into both splits.
splitColumns = ['Rating', 'Reviews', 'Installs', 'Genres', 'Hash']

# reset_index gives each split a fresh 0..n-1 index and an independent copy.
# 'Hash' keeps its integer dtype here (np.append would have coerced it to float).
# Create Train DataFrame
trainDFPlayStore = dfPlayStore.loc[isTrainRow, splitColumns].reset_index(drop = True)
# Create Test DataFrame
testDFPlayStore = dfPlayStore.loc[~isTrainRow, splitColumns].reset_index(drop = True)

# Number of rows that ended up in each split.
trainKey = len(trainDFPlayStore)
testKey = len(testDFPlayStore)

# Every row must land in exactly one of the two splits.
assert trainKey + testKey == len(dfPlayStore)

In [None]:
# Create Train and Test CSVs
testDFPlayStore.to_csv('test.csv')
trainDFPlayStore.to_csv('train.csv')
!ls -lrt
!head test.csv
!head train.csv

In [None]:
# Add a float version of 'Reviews' as the 'ReviewsFloat' column.
# The column is added to BOTH splits: the 'ReviewsFloat' feature column is
# looked up by name in whatever frame feeds the model, so a test frame without
# it cannot be used for evaluation or prediction.
# astype(float) converts the whole column in one pass, replacing a
# per-element np.append loop that re-allocated the array on every iteration.
for splitFrame in (trainDFPlayStore, testDFPlayStore):
    splitFrame['ReviewsFloat'] = splitFrame['Reviews'].astype(float)

# Plain numpy array of the training values.
reviewsFloat = trainDFPlayStore['ReviewsFloat'].values

trainDFPlayStore[:5]

In [None]:
# Create Input Function, feature tensors etc.

# Training input: shuffled batches of 128 rows, 100 passes over the data,
# with 'Rating' supplied as the label.
trainInputFunction = tf.estimator.inputs.pandas_input_fn(
    x = trainDFPlayStore,
    y = trainDFPlayStore['Rating'],
    batch_size = 128,
    num_epochs = 100,
    shuffle = True,
    num_threads = 1
)

# Test input: unshuffled batches of 128 rows without labels.
testInputFunction = tf.estimator.inputs.pandas_input_fn(
    x = testDFPlayStore,
    y = None,
    batch_size = 128,
    shuffle = False,
    num_threads = 1
)

# NOTE(review): 'Rating' is also the label passed as y above; feeding ratingT
# to a model as an input feature would leak the target.
ratingT = tf.feature_column.numeric_column('Rating')
reviewsT = tf.feature_column.numeric_column('ReviewsFloat')

# indicator_column() wraps a categorical column, not a bare column name, so
# 'Installs' is first declared as a categorical column whose vocabulary is
# the set of values present in the training frame.
installsVocabulary = sorted(str(value) for value in trainDFPlayStore['Installs'].unique())
installsT = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'Installs',
        vocabulary_list = installsVocabulary
    )
)

In [None]:
# SECTION: Train a Linear Model

# Start from an empty model directory so no earlier checkpoint is reused.
OUTDIR = 'Trained-Linear'
shutil.rmtree(OUTDIR, ignore_errors = True)

# 'Rating' is the label being predicted, so it must not also be an input
# feature (target leakage): the model is built on the review count only.
model = tf.estimator.LinearRegressor(
      feature_columns = [reviewsT], model_dir = OUTDIR)

model.train(input_fn = trainInputFunction)

In [None]:
# Train a DNN Regressor

# Start from an empty model directory so no earlier checkpoint is reused.
OUTDIR = 'Trained-DNN'
shutil.rmtree(OUTDIR, ignore_errors = True)

# 'Rating' is the label being predicted, so it must not also be an input
# feature (target leakage): the network is built on the review count only.
# Two hidden layers with 8 and 4 units.
Estimator_DNN = tf.estimator.DNNRegressor(
    feature_columns = [reviewsT],
    model_dir = OUTDIR,
    hidden_units = [8, 4]
)

Estimator_DNN.train(input_fn = trainInputFunction)