In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import numpy as np

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
!ls '/content/drive/My Drive/EhealthData/googleplaystore.csv'

'/content/drive/My Drive/EhealthData/googleplaystore.csv'


In [67]:
data = pd.read_csv('/content/drive/My Drive/EhealthData/googleplaystore.csv')
data.head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [None]:
print(data.shape)

(10841, 13)


First of all lets preprocess our data, for example find inconsistent values in each column

In [68]:
print(data.Category.unique())
data[data.Category == '1.9']
data = data.loc[data['Category'] != '1.9']

['ART_AND_DESIGN' 'AUTO_AND_VEHICLES' 'BEAUTY' 'BOOKS_AND_REFERENCE'
 'BUSINESS' 'COMICS' 'COMMUNICATION' 'DATING' 'EDUCATION' 'ENTERTAINMENT'
 'EVENTS' 'FINANCE' 'FOOD_AND_DRINK' 'HEALTH_AND_FITNESS' 'HOUSE_AND_HOME'
 'LIBRARIES_AND_DEMO' 'LIFESTYLE' 'GAME' 'FAMILY' 'MEDICAL' 'SOCIAL'
 'SHOPPING' 'PHOTOGRAPHY' 'SPORTS' 'TRAVEL_AND_LOCAL' 'TOOLS'
 'PERSONALIZATION' 'PRODUCTIVITY' 'PARENTING' 'WEATHER' 'VIDEO_PLAYERS'
 'NEWS_AND_MAGAZINES' 'MAPS_AND_NAVIGATION' '1.9']


In [69]:
print(data.Rating.unique())
data.query("Rating not in [4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.2, 4.6, 3.2, 4.]")
print(data['Rating'].isnull().sum()) # we have 1474 nan values, lets replace them with zeros
data['Rating'] = data['Rating'].fillna(0)
print(data['Rating'].isnull().sum())

[4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.2 4.6 3.2 4.  nan 4.8 4.9 3.6 3.7 3.3 3.4
 3.5 3.1 5.  2.6 3.  1.9 2.5 2.8 2.7 1.  2.9 2.3 2.2 1.7 2.  1.8 2.4 1.6
 2.1 1.4 1.5 1.2]
1474
0


In [70]:
print(data.Reviews.unique()) # a lot of values, lets use regex to find values which are not numbers
print(data['Reviews'].str.extract(r'(?P<letter>[a-z,A-Z])+')['letter'].unique()) # checked

['159' '967' '87510' ... '603' '1195' '398307']
[nan]


In [71]:
print(data.Type.unique())
data.query("Type not in ['Free','Paid']") # only one row, more than that has problems with other columns, just delete it
data = data[(data['Type'] == 'Free')|(data['Type'] == 'Paid')]
data

['Free' 'Paid' nan]


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,0.0,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [72]:
print(data['Content Rating'].unique())
data[data['Content Rating'] == 'Unrated'] # just two rows both of them was updated long time ago, we could delete them
data = data[(data['Content Rating'] != 'Unrated')]
data

['Everyone' 'Teen' 'Everyone 10+' 'Mature 17+' 'Adults only 18+' 'Unrated']


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,0.0,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [73]:
(data['Last Updated'].unique())
print(data['Reviews'].str.extract(r'.*(?P<letter>20)+.*'))
#data[data.Category == '1.9']
#data = data.loc[data['Category'] != '1.9']

      letter
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
...      ...
10836    NaN
10837    NaN
10838    NaN
10839    NaN
10840    NaN

[10837 rows x 1 columns]


Preprocessing for working with numerical values

In [74]:
print(data["Installs"].unique())
def make_installs(s):
  return int(s.replace('Free','0').replace('+','').replace(',',''))

data['Installs'] = data['Installs'].apply(make_installs)
print(data["Installs"].unique())

['10,000+' '500,000+' '5,000,000+' '50,000,000+' '100,000+' '50,000+'
 '1,000,000+' '10,000,000+' '5,000+' '100,000,000+' '1,000,000,000+'
 '1,000+' '500,000,000+' '50+' '100+' '500+' '10+' '1+' '5+' '0+']
[     10000     500000    5000000   50000000     100000      50000
    1000000   10000000       5000  100000000 1000000000       1000
  500000000         50        100        500         10          1
          5          0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [75]:
print(data['Size'].unique())
def make_size(s):
  morphed = s.replace('Varies with device','500000').replace('M','000000').replace('k','000')
  if (morphed.find('.') != -1):
    morphed = morphed[:-1]
    morphed = morphed.replace('.','')
  return int(morphed.replace('Free','0').replace('+','').replace(',',''))

data['Size'] = data['Size'].apply(make_size)
print(data['Size'].unique())

['19M' '14M' '8.7M' '25M' '2.8M' '5.6M' '29M' '33M' '3.1M' '28M' '12M'
 '20M' '21M' '37M' '2.7M' '5.5M' '17M' '39M' '31M' '4.2M' '7.0M' '23M'
 '6.0M' '6.1M' '4.6M' '9.2M' '5.2M' '11M' '24M' 'Varies with device'
 '9.4M' '15M' '10M' '1.2M' '26M' '8.0M' '7.9M' '56M' '57M' '35M' '54M'
 '201k' '3.6M' '5.7M' '8.6M' '2.4M' '27M' '2.5M' '16M' '3.4M' '8.9M'
 '3.9M' '2.9M' '38M' '32M' '5.4M' '18M' '1.1M' '2.2M' '4.5M' '9.8M' '52M'
 '9.0M' '6.7M' '30M' '2.6M' '7.1M' '3.7M' '22M' '7.4M' '6.4M' '3.2M'
 '8.2M' '9.9M' '4.9M' '9.5M' '5.0M' '5.9M' '13M' '73M' '6.8M' '3.5M'
 '4.0M' '2.3M' '7.2M' '2.1M' '42M' '7.3M' '9.1M' '55M' '23k' '6.5M' '1.5M'
 '7.5M' '51M' '41M' '48M' '8.5M' '46M' '8.3M' '4.3M' '4.7M' '3.3M' '40M'
 '7.8M' '8.8M' '6.6M' '5.1M' '61M' '66M' '79k' '8.4M' '118k' '44M' '695k'
 '1.6M' '6.2M' '18k' '53M' '1.4M' '3.0M' '5.8M' '3.8M' '9.6M' '45M' '63M'
 '49M' '77M' '4.4M' '4.8M' '70M' '6.9M' '9.3M' '10.0M' '8.1M' '36M' '84M'
 '97M' '2.0M' '1.9M' '1.8M' '5.3M' '47M' '556k' '526k' '76M' '7.6M'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [76]:
print(data['Price'].unique())
def make_price(s):
  return float(s.replace('Everyone','0').replace('$',''))
data['Price'] = data['Price'].apply(make_price) 
print(data['Price'].unique())

['0' '$4.99' '$3.99' '$6.99' '$1.49' '$2.99' '$7.99' '$5.99' '$3.49'
 '$1.99' '$9.99' '$7.49' '$0.99' '$9.00' '$5.49' '$10.00' '$24.99'
 '$11.99' '$79.99' '$16.99' '$14.99' '$1.00' '$29.99' '$12.99' '$2.49'
 '$10.99' '$1.50' '$19.99' '$15.99' '$33.99' '$74.99' '$39.99' '$3.95'
 '$4.49' '$1.70' '$8.99' '$2.00' '$3.88' '$25.99' '$399.99' '$17.99'
 '$400.00' '$3.02' '$1.76' '$4.84' '$4.77' '$1.61' '$2.50' '$1.59' '$6.49'
 '$1.29' '$5.00' '$13.99' '$299.99' '$379.99' '$37.99' '$18.99' '$389.99'
 '$19.90' '$8.49' '$1.75' '$14.00' '$4.85' '$46.99' '$109.99' '$154.99'
 '$3.08' '$2.59' '$4.80' '$1.96' '$19.40' '$3.90' '$4.59' '$15.46' '$3.04'
 '$4.29' '$2.60' '$3.28' '$4.60' '$28.99' '$2.95' '$2.90' '$1.97'
 '$200.00' '$89.99' '$2.56' '$30.99' '$3.61' '$394.99' '$1.26' '$1.20'
 '$1.04']
[  0.     4.99   3.99   6.99   1.49   2.99   7.99   5.99   3.49   1.99
   9.99   7.49   0.99   9.     5.49  10.    24.99  11.99  79.99  16.99
  14.99   1.    29.99  12.99   2.49  10.99   1.5   19.99  15.99  33.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [93]:
def make_price(s):
  return int(s)
data['Reviews'] = data['Reviews'].apply(make_price) 
print(data['Reviews'].unique())

[  0.     4.99   3.99   1.49   2.99   7.99   3.49   1.99   5.99   6.99
   9.99   7.49   0.99   1.     2.49  10.99   1.5   14.99  15.99  79.99
   9.    10.    16.99  11.99  29.99  74.99   5.49  33.99  24.99  39.99
  19.99   4.49   1.7    8.99   2.     3.88  25.99 399.99  17.99 400.
   3.02   1.76   4.84   4.77   1.61   2.5    1.59   6.49   1.29   5.
  13.99 299.99 379.99  37.99  18.99 389.99  19.9    8.49   1.75  14.
   4.85  46.99 109.99   3.95 154.99   3.08   2.59   4.8    1.96  19.4
   3.9    4.59  15.46   3.04  12.99   4.29   2.6    3.28   4.6   28.99
   2.95   2.9    1.97 200.    89.99   2.56  30.99   3.61 394.99   1.26
   1.2    1.04]


In [77]:
data.head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000000,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700000,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


Lets check for duplication in data

In [78]:
print(data.shape)
print(data['App'].unique().__len__())
print(data[(data["App"]=="Duolingo: Learn Languages Free")])

(10837, 13)
9656
                                 App  ...         Android Ver
699   Duolingo: Learn Languages Free  ...  Varies with device
784   Duolingo: Learn Languages Free  ...  Varies with device
799   Duolingo: Learn Languages Free  ...  Varies with device
826   Duolingo: Learn Languages Free  ...  Varies with device
2056  Duolingo: Learn Languages Free  ...  Varies with device
2216  Duolingo: Learn Languages Free  ...  Varies with device
8439  Duolingo: Learn Languages Free  ...  Varies with device

[7 rows x 13 columns]


And it is weird because the same application has different categories (updated information), so lets choose the lattest(last) version. 

In [79]:
data = data.drop_duplicates(subset=['App'], keep="last")
print(data.shape)
print(data['App'].unique().__len__())
data[(data["App"]=="Duolingo: Learn Languages Free")]

(9656, 13)
9656


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8439,Duolingo: Learn Languages Free,FAMILY,4.7,6297590,500000,100000000,Free,0.0,Everyone,Education;Education,"August 6, 2018",Varies with device,Varies with device


In [80]:
data.Genres.unique()

array(['Art & Design', 'Art & Design;Creativity', 'Auto & Vehicles',
       'Beauty', 'Books & Reference', 'Business', 'Comics',
       'Comics;Creativity', 'Communication', 'Dating', 'Education',
       'Education;Creativity', 'Education;Education',
       'Education;Pretend Play', 'Education;Brain Games', 'Entertainment',
       'Entertainment;Brain Games', 'Entertainment;Creativity',
       'Entertainment;Music & Video', 'Events', 'Finance', 'Food & Drink',
       'Health & Fitness', 'House & Home', 'Libraries & Demo',
       'Lifestyle', 'Lifestyle;Pretend Play', 'Puzzle', 'Racing',
       'Sports', 'Casual', 'Arcade', 'Trivia', 'Simulation', 'Action',
       'Word', 'Role Playing', 'Strategy', 'Board',
       'Simulation;Education', 'Adventure', 'Card',
       'Action;Action & Adventure', 'Music', 'Casual;Brain Games',
       'Educational;Creativity', 'Puzzle;Brain Games',
       'Educational;Education', 'Card;Brain Games', 'Casual;Pretend Play',
       'Educational;Brain Games', 

As we see application may have at most rwo categories, lo lets split them into two columns with ';' as a delimeter 

In [81]:
data[["First genre", "Second genre"]]= data["Genres"].str.split(";", n = 1, expand = True) 
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,First genre,Second genre
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,Art & Design,
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700000,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,Art & Design,
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,Art & Design,
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,Art & Design,Creativity
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5600000,50000,Free,0.0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up,Art & Design,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53000000,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,Education,
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3600000,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,Education,
10838,Parkinson Exercices FR,MEDICAL,0.0,3,9500000,1000,Free,0.0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up,Medical,
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,500000,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,Books & Reference,


In [82]:
data = data.drop(columns=['Genres'])
print(data['First genre'].unique())
print(data['Second genre'].unique())

['Art & Design' 'Auto & Vehicles' 'Beauty' 'Books & Reference' 'Business'
 'Comics' 'Communication' 'Dating' 'Education' 'Entertainment' 'Events'
 'Finance' 'Food & Drink' 'Health & Fitness' 'House & Home'
 'Libraries & Demo' 'Lifestyle' 'Puzzle' 'Racing' 'Sports' 'Casual'
 'Arcade' 'Trivia' 'Simulation' 'Action' 'Word' 'Role Playing' 'Strategy'
 'Board' 'Adventure' 'Card' 'Music' 'Educational' 'Music & Audio'
 'Medical' 'Social' 'Shopping' 'Photography' 'Travel & Local' 'Tools'
 'Personalization' 'Productivity' 'Parenting' 'Weather'
 'Video Players & Editors' 'News & Magazines' 'Maps & Navigation' 'Casino']
[None 'Creativity' 'Education' 'Pretend Play' 'Brain Games'
 'Music & Video' 'Action & Adventure']


In [23]:
data

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated,Current Ver,Android Ver,First genre,Second genre
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000,10000,,0.0,Photo Editor & Candy Camera & Grid & ScrapBook,"January 7, 2018",1.0.0,4.0.3 and up,Art & Design,
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700000,5000000,,0.0,"U Launcher Lite – FREE Live Cool Themes, Hide ...","August 1, 2018",1.2.4,4.0.3 and up,Art & Design,
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000,50000000,,0.0,Sketch - Draw & Paint,"June 8, 2018",Varies with device,4.2 and up,Art & Design,
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000,100000,,0.0,Pixel Draw - Number Art Coloring Book,"June 20, 2018",1.1,4.4 and up,Art & Design,Creativity
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5600000,50000,,0.0,Paper flowers instructions,"March 26, 2017",1.0,2.3 and up,Art & Design,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53000000,5000,,0.0,Sya9a Maroc - FR,"July 25, 2017",1.48,4.1 and up,Education,
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3600000,100,,0.0,Fr. Mike Schmitz Audio Teachings,"July 6, 2018",1.0,4.1 and up,Education,
10838,Parkinson Exercices FR,MEDICAL,0.0,3,9500000,1000,,0.0,Parkinson Exercices FR,"January 20, 2017",1.0,2.2 and up,Medical,
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,500000,1000,,0.0,The SCP Foundation DB fr nn5n,"January 19, 2015",Varies with device,Varies with device,Books & Reference,


In [None]:
#data = pd.get_dummies(data, columns=["First genre", "Second genre"],prefix=['genre','genre'])
#data
#data[data['genre_Music & Audio']==1]
#data = data[data['genre_Music & Audio']!=1]

We can drop Music & Audio column as it has only one string

Lets choose as example of application "Duolingo: Learn Languages Free" , and find similar applications 

In [94]:
#data[(data["App"]=="Duolingo: Learn Languages Free")]
data[((data["Content Rating"]=="Everyone") & (data["Rating"]>=4.3) & (data['First genre']=='Education') & (data["Reviews"]>=3000) )]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated,Current Ver,Android Ver,First genre,Second genre
706,Learn Spanish - Español,EDUCATION,4.7,32346,3200000,1000000,Free,0.0,Everyone,"December 18, 2017",1.3.8,4.0 and up,Education,
708,Speed Reading,EDUCATION,4.6,10611,11000000,500000,Free,0.0,Everyone,"May 27, 2018",2.3.7,4.0 and up,Education,
709,English for beginners,EDUCATION,4.6,9321,27000000,1000000,Free,0.0,Everyone,"July 18, 2018",2.9.0,4.0 and up,Education,
710,Flame - درب عقلك يوميا,EDUCATION,4.6,56065,37000000,1000000,Free,0.0,Everyone,"July 26, 2018",3.3,4.1 and up,Education,
712,"Learn Japanese, Korean, Chinese Offline & Free",EDUCATION,4.9,133136,26000000,1000000,Free,0.0,Everyone,"July 20, 2018",2.16.11.10,4.2 and up,Education,Education
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9620,Hello English: Learn English,FAMILY,4.6,750321,500000,10000000,Free,0.0,Everyone,"August 7, 2018",Varies with device,Varies with device,Education,
9629,Learn English Daily,FAMILY,4.5,7881,53000000,500000,Free,0.0,Everyone,"May 6, 2018",7.0,4.0 and up,Education,
9639,Learn Top 300 English Words,FAMILY,4.7,61600,3400000,1000000,Free,0.0,Everyone,"February 24, 2018",3.1,2.3 and up,Education,
9667,Masha and the Bear,FAMILY,4.4,15105,36000000,1000000,Free,0.0,Everyone,"June 11, 2018",3.1.8,4.3 and up,Education,


Now we try to make a database of serious games, first of all choose filters, which allow to distinguish if the  is serious:
1) The rating must be higher than 4.4
2) It should have more than 10000 reviews
3) It should have more than 100000 downloads

In [96]:
serious_data = data[((data["Installs"]>=100000) & (data["Rating"]>=4.4)  & (data["Reviews"]>=10000) )]
serious_data.to_csv('/content/drive/My Drive/EhealthData/serious_data.csv', index=False)

In [None]:
plt.figure(figsize=(12, 12))
data["Installs"].hist()
#
#
#
#
#
#
lis = ['Casual', 'Card', 'Casual;Pretend Play', 'Action', 'Puzzle',
       'Arcade', 'Sports', 'Music', 'Word', 'Casual;Creativity', 'Racing',
       'Simulation', 'Adventure', 'Trivia', 'Board',
       'Simulation;Education', 'Role Playing',
       'Action;Action & Adventure', 'Casual;Brain Games',
       'Simulation;Action & Adventure', 'Casino']
print(lis.__len__())

21


substitute all strings with categorical and range values to use clustering algorithms

In [None]:
altered_data = data
altered_data = altered_data.replace(['Casual', 'Card', 'Casual;Pretend Play', 'Action', 'Puzzle',
       'Arcade', 'Sports', 'Music', 'Word', 'Casual;Creativity', 'Racing',
       'Simulation', 'Adventure', 'Trivia', 'Board',
       'Simulation;Education', 'Role Playing',
       'Action;Action & Adventure', 'Casual;Brain Games',
       'Simulation;Action & Adventure', 'Casino'], range(21)) 
altered_data = altered_data.replace()
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering


def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


X = data.drop(columns=['App'])

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)

model = model.fit(X)
plt.title('Hierarchical Clustering Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()