In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import shutil

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['googleplaystore_user_reviews.csv', 'license.txt', 'googleplaystore.csv']


In [2]:
# SECTION: What do we have here!?

# Loading the dataset
ReviewsFile = '../input/googleplaystore_user_reviews.csv'
PlayStoreFile = '../input/googleplaystore.csv'
dfReviews = pd.read_csv(ReviewsFile)
dfPlayStore = pd.read_csv(PlayStoreFile)

print('Reviews Dataset size: {}'.format(len(dfReviews)))
print('PlayStore Dataset size: {}'.format(len(dfPlayStore)))
print('\n')
print(dfReviews.info())
print('\n')
print(dfPlayStore.info())

Reviews Dataset size: 64295
PlayStore Dataset size: 10841


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
App                       64295 non-null object
Translated_Review         37427 non-null object
Sentiment                 37432 non-null object
Sentiment_Polarity        37432 non-null float64
Sentiment_Subjectivity    37432 non-null float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App               10841 non-null object
Category          10841 non-null object
Rating            9367 non-null float64
Reviews           10841 non-null object
Size              10841 non-null object
Installs          10841 non-null object
Type              10840 non-null object
Price             10841 non-null object
Content Rating    10840 non-null object
Genres            10841 non-null object
Last Updated      1084

In [3]:
# SECTION: Creating one single Dataframe with all required columns

# INTENT:
# 1. Let us have 'Rating' as the label
# 2. The goal will be to, provided the following information, predict rating:
#   a. Reviews
#   b. Installs
#   c. Genres (For a more complex version)

# dfReviews is not of any particular use considering the above intent
# Modifying dfPlayStore as required

dfPlayStore.drop(columns = ['App', 'Category', 'Size',\
                            'Type', 'Price', 'Content Rating',\
                            'Last Updated', 'Current Ver', 'Android Ver'] , axis = 1, inplace = True)
dfPlayStore[:5]

Unnamed: 0,Rating,Reviews,Installs,Genres
0,4.1,159,"10,000+",Art & Design
1,3.9,967,"500,000+",Art & Design;Pretend Play
2,4.7,87510,"5,000,000+",Art & Design
3,4.5,215644,"50,000,000+",Art & Design
4,4.3,967,"100,000+",Art & Design;Creativity


In [4]:
# Check the values against 'Installs' column
dfPlayStore.groupby('Installs').count()

# There is an unwanted value called 'Free'. Remove it.
dfPlayStore = dfPlayStore[(dfPlayStore['Installs'] != 'Free')]

# Length of 'Ratings' column
len(dfPlayStore['Rating'])
# Length of 'Ratings' column without null values
dfPlayStore['Rating'].count()
# So, quiet a few rows with null ratings. Remove them.
dfPlayStore.dropna(axis = 0, subset = ['Rating'], inplace = True, how = 'any')

# Count of records in the Dataframe now
len(dfPlayStore)

9366

In [5]:
# SECTION: Split Dataset for training, evaluation and testing

#SUB SECTION: Using feature 'Installs' for splitting the dataset. So, create hashs of its values.
# Extract all the unique 'Installs' values
tempDf = dfPlayStore.groupby('Installs').count()
installsLst = list(tempDf.index)

# Create a new Dataframe that will hold Hash values of 'Installs'
installs = pd.DataFrame(data = installsLst, columns = ['Installs'])
installs['Hash'] = [hash(installs.iloc[i]['Installs']) for i in range(len(installs))]

installs

Unnamed: 0,Installs,Hash
0,1+,-282863904388221609
1,"1,000+",9095878737950934241
2,"1,000,000+",92586550578197505
3,"1,000,000,000+",-3060612850577766263
4,10+,3231562878121177088
5,"10,000+",149360513842492561
6,"10,000,000+",5211666146099626289
7,100+,7521255356307719089
8,"100,000+",271865914791637336
9,"100,000,000+",-4906999090817040387


In [6]:
# SUB SECTION: Mege Hash values with the original DataFrame.
dfPlayStore = pd.merge(dfPlayStore, installs, left_on = 'Installs', right_on = 'Installs', how = 'inner')
dfPlayStore

Unnamed: 0,Rating,Reviews,Installs,Genres,Hash
0,4.1,159,"10,000+",Art & Design,149360513842492561
1,4.7,121,"10,000+",Art & Design;Creativity,149360513842492561
2,4.5,27,"10,000+",Art & Design,149360513842492561
3,4.8,192,"10,000+",Art & Design,149360513842492561
4,3.9,136,"10,000+",Art & Design,149360513842492561
5,4.7,353,"10,000+",Art & Design,149360513842492561
6,4.7,158,"10,000+",Art & Design,149360513842492561
7,4.2,117,"10,000+",Art & Design,149360513842492561
8,4.2,26,"10,000+",Art & Design,149360513842492561
9,4.6,534,"10,000+",Auto & Vehicles,149360513842492561


In [7]:
# Create a new DF that will tell us at what number to stop splitting on each Hash value
individualHashSplit = dfPlayStore.groupby('Hash').count()['Rating'].values*.7
individualHashSplit = np.around(individualHashSplit)
dfIndividualHashSplit = pd.DataFrame(data = individualHashSplit, columns = ['Split'])
dfIndividualHashSplit['Hash'] = dfPlayStore.groupby('Hash').count()['Rating'].index
dfIndividualHashSplit['Total Records'] = dfPlayStore.groupby('Hash').count()['Rating'].values

dfPlayStore

Unnamed: 0,Rating,Reviews,Installs,Genres,Hash
0,4.1,159,"10,000+",Art & Design,149360513842492561
1,4.7,121,"10,000+",Art & Design;Creativity,149360513842492561
2,4.5,27,"10,000+",Art & Design,149360513842492561
3,4.8,192,"10,000+",Art & Design,149360513842492561
4,3.9,136,"10,000+",Art & Design,149360513842492561
5,4.7,353,"10,000+",Art & Design,149360513842492561
6,4.7,158,"10,000+",Art & Design,149360513842492561
7,4.2,117,"10,000+",Art & Design,149360513842492561
8,4.2,26,"10,000+",Art & Design,149360513842492561
9,4.6,534,"10,000+",Auto & Vehicles,149360513842492561


In [8]:
# Create a dictionary of Hashes and values
# We go on decrementing the values during splitting of the dataset
hashTreshDict = {}
for i in range(len(dfIndividualHashSplit)):
    # Directly doing an iloc on the Data Frame is converting 'Hash' column to float
    # So, the below workaround instead
    hashSeries = dfIndividualHashSplit['Hash']
    hashVal = hashSeries.iloc[i]
    
    splitVal = dfIndividualHashSplit.iloc[i]['Split']
    hashTreshDict[hashVal] = splitVal

#Initialize all variables that we require during splitting
testDFDict = {}
trainDFDict = {}
testKey = 0
trainKey = 0

arrRating_Test = np.array([])
arrReviews_Test = np.array([])
arrInstalls_Test = np.array([])
arrGenres_Test = np.array([])
arrHash_Test = np.array([])

arrRating_Train = np.array([])
arrReviews_Train = np.array([])
arrInstalls_Train = np.array([])
arrGenres_Train = np.array([])
arrHash_Train = np.array([])

# Do the actual splitting
for i in range(len(dfPlayStore)):
    # First of all, get the hash value
    getHash = dfPlayStore.iloc[i]['Hash']

    # Decrement count from the dictionary we are using for tracking
    hashTreshDict[getHash] = hashTreshDict[getHash] - 1
    
    # Create numpy arrays for Test Dataset
    if hashTreshDict[getHash] < 0:
        arrRating_Test = np.append(arrRating_Test, dfPlayStore.iloc[i]['Rating'])
        arrReviews_Test = np.append(arrReviews_Test, dfPlayStore.iloc[i]['Reviews'])
        arrInstalls_Test = np.append(arrInstalls_Test, dfPlayStore.iloc[i]['Installs'])
        arrGenres_Test = np.append(arrGenres_Test, dfPlayStore.iloc[i]['Genres'])
        arrHash_Test = np.append(arrHash_Test, dfPlayStore.iloc[i]['Hash'])
        testKey = testKey + 1
    # Create numpy arrays for Train Dataset
    else:
        arrRating_Train = np.append(arrRating_Train, dfPlayStore.iloc[i]['Rating'])
        arrReviews_Train = np.append(arrReviews_Train, dfPlayStore.iloc[i]['Reviews'])
        arrInstalls_Train = np.append(arrInstalls_Train, dfPlayStore.iloc[i]['Installs'])
        arrGenres_Train = np.append(arrGenres_Train, dfPlayStore.iloc[i]['Genres'])
        arrHash_Train = np.append(arrHash_Train, dfPlayStore.iloc[i]['Hash'])
        trainKey = trainKey + 1

# Create Test DataFrame
testDFDict['Rating'] = arrRating_Test
testDFDict['Reviews'] = arrReviews_Test
testDFDict['Installs'] = arrInstalls_Test
testDFDict['Genres'] = arrGenres_Test
testDFDict['Hash'] = arrHash_Test
testDFPlayStore = pd.DataFrame(testDFDict)

# Create Train DataFrame
trainDFDict['Rating'] = arrRating_Train
trainDFDict['Reviews'] = arrReviews_Train
trainDFDict['Installs'] = arrInstalls_Train
trainDFDict['Genres'] = arrGenres_Train
trainDFDict['Hash'] = arrHash_Train
trainDFPlayStore = pd.DataFrame(trainDFDict)

In [9]:
# Create Train and Test CSVs
testDFPlayStore.to_csv('test.csv')
trainDFPlayStore.to_csv('train.csv')
!ls -lrt
!head test.csv
!head train.csv

total 628
-rw-r--r-- 1 root root  63782 May  2 14:21 __notebook__.ipynb
-rw-r--r-- 1 root root    270 May  2 14:21 __output__.json
-rw-r--r-- 1 root root 164799 May  2 14:21 test.csv
-rw-r--r-- 1 root root 389394 May  2 14:21 train.csv
,Rating,Reviews,Installs,Genres,Hash
0,4.4,137,"10,000+",House & Home,1.4936051384249258e+17
1,3.9,117,"10,000+",Shopping,1.4936051384249258e+17
2,4.3,87,"10,000+",Tools,1.4936051384249258e+17
3,4.6,1641,"10,000+",Sports,1.4936051384249258e+17
4,3.5,1151,"10,000+",Sports,1.4936051384249258e+17
5,4.4,766,"10,000+",Education;Education,1.4936051384249258e+17
6,3.8,52,"10,000+",Educational;Pretend Play,1.4936051384249258e+17
7,3.9,203,"10,000+",Tools,1.4936051384249258e+17
8,4.3,824,"10,000+",Tools,1.4936051384249258e+17
,Rating,Reviews,Installs,Genres,Hash
0,4.1,159,"10,000+",Art & Design,1.4936051384249258e+17
1,4.7,121,"10,000+",Art & Design;Creativity,1.4936051384249258e+17
2,4.5,27,"10,000+",Art & Design,1.4936051384249258e+17
3,4.8,1

In [10]:
'''# SECTION: Build a Neural Network to work on our dataset
print(tf.__version__)
print(tf.keras.__version__)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape = [1,], name = "Reviews"))
model.add(tf.keras.layers.Dense(8, activation = 'relu'))
model.add(tf.keras.layers.Dense(8, activation = 'relu'))

model.compile(optimizer = tf.keras.optimizers.Optimizer(), loss = tf.keras.losses.mean_squared_error, metrics = ['accuracy'])

ratingT = tf.feature_column.numeric_column('Rating')
reviewsT = tf.feature_column.numeric_column('Reviews')
installsT = tf.feature_column.indicator_column('Installs')

model.fit(x = {'Installs': arrInstalls_Train, 'Reviews': arrReviews_Train}, y = arrRating_Train, epochs = 2, verbose = 2, shuffle = True)
'''

'# SECTION: Build a Neural Network to work on our dataset\nprint(tf.__version__)\nprint(tf.keras.__version__)\n\nmodel = tf.keras.models.Sequential()\nmodel.add(tf.keras.layers.InputLayer(input_shape = [1,], name = "Reviews"))\nmodel.add(tf.keras.layers.Dense(8, activation = \'relu\'))\nmodel.add(tf.keras.layers.Dense(8, activation = \'relu\'))\n\nmodel.compile(optimizer = tf.keras.optimizers.Optimizer(), loss = tf.keras.losses.mean_squared_error, metrics = [\'accuracy\'])\n\nratingT = tf.feature_column.numeric_column(\'Rating\')\nreviewsT = tf.feature_column.numeric_column(\'Reviews\')\ninstallsT = tf.feature_column.indicator_column(\'Installs\')\n\nmodel.fit(x = {\'Installs\': arrInstalls_Train, \'Reviews\': arrReviews_Train}, y = arrRating_Train, epochs = 2, verbose = 2, shuffle = True)\n'

In [11]:
i = 0
reviewsFloat = np.array([])
for i in range(len(trainDFPlayStore['Reviews'])):
    reviewsFloat = np.append(reviewsFloat, float(trainDFPlayStore['Reviews'][i]))

reviewsFloat
trainDFPlayStore['ReviewsFloat'] = reviewsFloat
trainDFPlayStore[:5]

Unnamed: 0,Rating,Reviews,Installs,Genres,Hash,ReviewsFloat
0,4.1,159,"10,000+",Art & Design,1.493605e+17,159.0
1,4.7,121,"10,000+",Art & Design;Creativity,1.493605e+17,121.0
2,4.5,27,"10,000+",Art & Design,1.493605e+17,27.0
3,4.8,192,"10,000+",Art & Design,1.493605e+17,192.0
4,3.9,136,"10,000+",Art & Design,1.493605e+17,136.0


In [12]:
# Create Input Function, feature tensors etc.
trainInputFunction = tf.estimator.inputs.pandas_input_fn(
    x = trainDFPlayStore,
    y= trainDFPlayStore['Rating'],
    batch_size = 128,
    num_epochs = 100,
    shuffle = True,
    num_threads=1
)

testInputFunction = tf.estimator.inputs.pandas_input_fn(
    x = testDFPlayStore,
    y = None,
    batch_size = 128,
    shuffle = False,
    num_threads = 1
)

ratingT = tf.feature_column.numeric_column('Rating')
reviewsT = tf.feature_column.numeric_column('ReviewsFloat')
installsT = tf.feature_column.indicator_column('Installs')

In [13]:
# SECTION: Train a Linear Model
OUTDIR = 'Trained-Linear'
shutil.rmtree(OUTDIR, ignore_errors = True)

model = tf.estimator.LinearRegressor(
      feature_columns = [ratingT, reviewsT], model_dir = OUTDIR)

model.train(input_fn = trainInputFunction)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'Trained-Linear', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff16a7f9cc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
To construct input pipelines, use the `tf.da

<tensorflow_estimator.python.estimator.canned.linear.LinearRegressor at 0x7ff16a7f9ba8>

In [14]:
# Train a DNN Regressor
OUTDIR = 'Trained-DNN'
shutil.rmtree(OUTDIR, ignore_errors = True)

Estimator_DNN = tf.estimator.DNNRegressor(
    feature_columns = [ratingT, reviewsT],
    model_dir = OUTDIR,
    hidden_units=[8, 4]
)

Estimator_DNN.train(input_fn = trainInputFunction)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'Trained-DNN', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff16815d860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was fin

<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor at 0x7ff16815d6a0>

In [15]:
# TO DO:
# 1. Create logic to split dataset:
#    a. It should not be split on Hash value
#    b. Training/Evaluation and Testing datasets should have values from all the Hashes
#    c. group by hashes, take count of reviews, and put 70% of the records in training datset, rest in testing dataset
# 2. Create TF Records
# 3. Create a Dense network and feed the two features (Reviews, One-hot-encoded installs) against label (Rating)