In [211]:
import os
import numpy as np
import pandas as pd
from collections import Counter

### 0. Load Data

In [212]:
# Relative paths to the two input CSVs: app metadata and user reviews.
path_info = '../archive_googleplaystore/googleplaystore.csv'
path_review = '../archive_googleplaystore/googleplaystore_user_reviews.csv'

In [213]:
# Load both tables; the app metadata is indexed by the 'App' column.
# NOTE(review): if app names repeat in the CSV, label-based operations on this
# index act on every row sharing a name - confirm uniqueness if that matters.
app_info = pd.read_csv(path_info, index_col='App')
app_review = pd.read_csv(path_review)

In [214]:
# Preview the first rows of the app metadata (indexed by App name).
app_info.head()

Unnamed: 0_level_0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [215]:
# Preview the first rows of the user-review table.
app_review.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


### 1. Preprocessing


#### 1-1. Drop nulls, columns, and rows

In [216]:
# App metadata: few nulls overall; Rating is the main column with missing values.
app_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10841 entries, Photo Editor & Candy Camera & Grid & ScrapBook to iHoroscope - 2018 Daily Horoscope & Astrology
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        10841 non-null  object 
 1   Rating          9367 non-null   float64
 2   Reviews         10841 non-null  object 
 3   Size            10841 non-null  object 
 4   Installs        10841 non-null  object 
 5   Type            10840 non-null  object 
 6   Price           10841 non-null  object 
 7   Content Rating  10840 non-null  object 
 8   Genres          10841 non-null  object 
 9   Last Updated    10841 non-null  object 
 10  Current Ver     10833 non-null  object 
 11  Android Ver     10838 non-null  object 
dtypes: float64(1), object(11)
memory usage: 1.1+ MB


In [217]:
# Review data: the review fields of a row are mostly either all present or all
# missing (Translated_Review has a few more gaps than the sentiment columns).
app_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [218]:
# Discard every review row that has a missing value in any column.
app_review.dropna(axis='index', inplace=True)

In [219]:
# Remove the Type, Android Ver and Current Ver columns from the app metadata.
columns_to_remove = ['Type', 'Android Ver', 'Current Ver']
app_info.drop(columns=columns_to_remove, inplace=True)

In [220]:
# One record is malformed: its Reviews field holds the text '3.0M'.
# Show it first, then remove it from the frame.
malformed_mask = app_info['Reviews'] == '3.0M'
display(app_info[malformed_mask])
malformed_labels = app_info[malformed_mask].index
app_info.drop(malformed_labels, axis=0, inplace=True)

Unnamed: 0_level_0,Category,Rating,Reviews,Size,Installs,Price,Content Rating,Genres,Last Updated
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,Everyone,,"February 11, 2018",1.0.19


#### 1-2. String to Float

In [221]:
# Re-inspect the frame after the drops to see which columns still hold formatted strings.
app_info.head()

Unnamed: 0_level_0,Category,Rating,Reviews,Size,Installs,Price,Content Rating,Genres,Last Updated
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",0,Everyone,Art & Design,"January 7, 2018"
Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",0,Everyone,Art & Design;Pretend Play,"January 15, 2018"
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",0,Everyone,Art & Design,"August 1, 2018"
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",0,Teen,Art & Design,"June 8, 2018"
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",0,Everyone,Art & Design;Creativity,"June 20, 2018"


In [222]:
# List the distinct Installs labels to see which formats the parser has to handle.
app_info['Installs'].unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+', '0'], dtype=object)

In [223]:
def size_converter(x):
    """Convert a size label with a unit suffix into a plain number.

    Parameters
    ----------
    x : str
        Size label such as '19M' or '201k'.

    Returns
    -------
    float
        The numeric part multiplied by 1,000,000 for a trailing 'M' or by
        1,000 for a trailing 'k'. ``np.nan`` is returned when the label has
        no recognised suffix (e.g. 'Varies with device'), is not a string
        (e.g. a missing value), or its numeric part cannot be parsed.
    """
    # Missing values reach .apply() as float NaN, not as strings.
    if not isinstance(x, str):
        return np.nan

    label = x.strip()
    multipliers = {'M': 1000000, 'k': 1000}

    # Only the trailing character is the unit; a substring test would misfire
    # on any text that merely contains 'M' or 'k'.
    multiplier = multipliers.get(label[-1:])
    if multiplier is None:
        return np.nan

    try:
        return float(label[:-1]) * multiplier
    except ValueError:
        # The suffix matched but the rest is not a number.
        return np.nan

def price_converter(x):
    """Convert a price label into a float amount.

    Parameters
    ----------
    x : str or number
        Price label such as '0' or '$4.99'. Values that are already numeric
        are passed through ``float``.

    Returns
    -------
    float
        The price with any leading '$' removed; '0' yields 0.0.

    Raises
    ------
    ValueError
        If the label is not a number once the leading '$' is removed.
    """
    # Already-numeric input (e.g. a column parsed as numbers) has no '$' to strip.
    if not isinstance(x, str):
        return float(x)
    # Strip only a leading '$' instead of blindly dropping the first character,
    # so an unprefixed price like '4.99' is not silently turned into 0.99.
    return float(x.strip().lstrip('$'))
def install_converter(x):
    """Turn an install-count label such as '10,000+' into a float (10000.0)."""
    # Remove the thousands separators and the trailing plus sign before parsing.
    digits_only = x.replace(',', '').replace('+', '')
    return float(digits_only)

In [224]:
# Parse the string-formatted columns into numbers, one parser per column.
column_parsers = {
    'Size': size_converter,
    'Price': price_converter,
    'Installs': install_converter,
}
for column_name, parser in column_parsers.items():
    app_info[column_name] = app_info[column_name].apply(parser)

In [225]:
# Display the frame to check the converted Size, Installs and Price columns.
app_info

Unnamed: 0_level_0,Category,Rating,Reviews,Size,Installs,Price,Content Rating,Genres,Last Updated
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000.0,10000.0,0.0,Everyone,Art & Design,"January 7, 2018"
Coloring book moana,ART_AND_DESIGN,3.9,967,14000000.0,500000.0,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018"
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,8700000.0,5000000.0,0.0,Everyone,Art & Design,"August 1, 2018"
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000.0,50000000.0,0.0,Teen,Art & Design,"June 8, 2018"
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000.0,100000.0,0.0,Everyone,Art & Design;Creativity,"June 20, 2018"
...,...,...,...,...,...,...,...,...,...
Sya9a Maroc - FR,FAMILY,4.5,38,53000000.0,5000.0,0.0,Everyone,Education,"July 25, 2017"
Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3600000.0,100.0,0.0,Everyone,Education,"July 6, 2018"
Parkinson Exercices FR,MEDICAL,,3,9500000.0,1000.0,0.0,Everyone,Medical,"January 20, 2017"
The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000.0,0.0,Mature 17+,Books & Reference,"January 19, 2015"


In [227]:
# Check dtypes and null counts after the conversions.
# NOTE(review): Reviews is not converted by the cells above and is still object dtype here.
app_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10840 entries, Photo Editor & Candy Camera & Grid & ScrapBook to iHoroscope - 2018 Daily Horoscope & Astrology
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        10840 non-null  object 
 1   Rating          9366 non-null   float64
 2   Reviews         10840 non-null  object 
 3   Size            9145 non-null   float64
 4   Installs        10840 non-null  float64
 5   Price           10840 non-null  float64
 6   Content Rating  10840 non-null  object 
 7   Genres          10840 non-null  object 
 8   Last Updated    10840 non-null  object 
dtypes: float64(4), object(5)
memory usage: 846.9+ KB


In [None]:
## TODO (remaining preprocessing): handle the null values in Rating and Size

In [230]:
# Write each cleaned table next to its own source file as "<name>_preprocessed.csv".
# The review table must be named from path_review: deriving both file names
# from path_info made the second write overwrite the first.
app_info.to_csv(os.path.splitext(path_info)[0] + '_preprocessed.csv')
app_review.to_csv(os.path.splitext(path_review)[0] + '_preprocessed.csv')

In [None]:
# Scratch check of the output file name built from path_info.
a = path_info[:-4]+'_preprocessed.csv'