In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("dataset/googleplaystore.csv")

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
df.drop(['App','Reviews', 'Installs','Type', 'Genres'], axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,Category,Rating,Size,Price,Content Rating,Last Updated,Current Ver,Android Ver
0,ART_AND_DESIGN,4.1,19M,0,Everyone,"January 7, 2018",1.0.0,4.0.3 and up
1,ART_AND_DESIGN,3.9,14M,0,Everyone,"January 15, 2018",2.0.0,4.0.3 and up
2,ART_AND_DESIGN,4.7,8.7M,0,Everyone,"August 1, 2018",1.2.4,4.0.3 and up
3,ART_AND_DESIGN,4.5,25M,0,Teen,"June 8, 2018",Varies with device,4.2 and up
4,ART_AND_DESIGN,4.3,2.8M,0,Everyone,"June 20, 2018",1.1,4.4 and up


In [6]:
value_counts = df['Category'].value_counts()
print(value_counts)

Category
FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
LIBRARIES_AND_DEMO       85
AUTO_AND_VEHICLES        85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
PARENTING                60
COMICS                   60
BEAUTY                   53
1.9                       1
Name: count, dtype: int64


In [7]:
df = df[df['Category'] != '1.9']
value_counts = df['Category'].value_counts()

In [8]:
le = LabelEncoder()
le.fit(df['Category'])
df['Category_Encoded'] = le.transform(df['Category'])
print(df[['Category', 'Category_Encoded']])


                  Category  Category_Encoded
0           ART_AND_DESIGN                 0
1           ART_AND_DESIGN                 0
2           ART_AND_DESIGN                 0
3           ART_AND_DESIGN                 0
4           ART_AND_DESIGN                 0
...                    ...               ...
10836               FAMILY                11
10837               FAMILY                11
10838              MEDICAL                20
10839  BOOKS_AND_REFERENCE                 3
10840            LIFESTYLE                18

[10840 rows x 2 columns]


In [9]:
num_missing = df['Rating'].isna().sum()

print("Number of missing values in 'Rating':", num_missing)
df.shape

Number of missing values in 'Rating': 1474


(10840, 9)

In [10]:
df.dropna(subset=['Rating'], inplace=True)
df.shape

(9366, 9)

In [11]:
df['Rating'].value_counts()

Rating
4.4    1109
4.3    1076
4.5    1038
4.2     952
4.6     823
4.1     708
4.0     568
4.7     499
3.9     386
3.8     303
5.0     274
3.7     239
4.8     234
3.6     174
3.5     163
3.4     128
3.3     102
4.9      87
3.0      83
3.1      69
3.2      64
2.9      45
2.8      42
2.7      25
2.6      25
2.5      21
2.3      20
2.4      19
1.0      16
2.2      14
1.9      13
2.0      12
1.7       8
1.8       8
2.1       8
1.6       4
1.4       3
1.5       3
1.2       1
Name: count, dtype: int64

In [12]:
print(df['Size'].value_counts())
num_missing = df['Size'].isna().sum()

print("Number of missing values in 'Size':", num_missing)

Size
Varies with device    1637
14M                    166
12M                    161
11M                    160
15M                    159
                      ... 
383k                     1
454k                     1
812k                     1
442k                     1
619k                     1
Name: count, Length: 414, dtype: int64
Number of missing values in 'Size': 0


In [13]:
letters_only = df['Size'].str.extract(r'([A-Za-z]+)')

letter_counts = letters_only[0].value_counts()

print(letter_counts)

0
M         7471
Varies    1637
k          258
Name: count, dtype: int64


In [15]:
def multiply_size(size_str):
    if size_str[-1] == 'M':
        return float(size_str[:-1]) * 1000000
    elif size_str[-1] == 'k':
        return float(size_str[:-1]) * 1000
    else:
        return float('NaN')

df['Size'] = df['Size'].apply(multiply_size)

In [16]:
df['Size']

0        19000000.0
1        14000000.0
2         8700000.0
3        25000000.0
4         2800000.0
            ...    
10834     2600000.0
10836    53000000.0
10837     3600000.0
10839           NaN
10840    19000000.0
Name: Size, Length: 9366, dtype: float64

In [23]:
df['Price'] = df['Price'].str.replace('$', '').astype(float)

print(df['Price'].value_counts())
num_missing = df['Price'].isna().sum()

print("Number of missing values in 'Price':", num_missing)

AttributeError: Can only use .str accessor with string values!