# Import packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np

# 1. Prepare and merge the datasets

In [None]:
# Load the first Google Play snapshot and keep only the columns whose
# name does not start with "Unnamed".
df_google_first = pd.read_csv(
    'app_sdk/google_data_first.csv', encoding='ISO-8859-1'
).loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]

In [None]:
# Load the last Google Play snapshot and keep only the columns whose
# name does not start with "Unnamed".
df_google_last = pd.read_csv(
    'app_sdk/google_data_last.csv', encoding='ISO-8859-1'
).loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]

In [None]:
# Rows of the first snapshot whose my_app_id does not occur in the last snapshot.
missing_rows = df_google_first.loc[
    lambda frame: ~frame['my_app_id'].isin(df_google_last['my_app_id'])
]

# Last snapshot first, then the first-snapshot-only rows, renumbered from 0.
df = pd.concat([df_google_last, missing_rows], ignore_index=True)

# 2. Analyze dataframe

In [None]:
def dataframe_summary(df):
    """Print the shape of *df* followed by a per-column overview table.

    The table is indexed by column name and lists the dtype, the number of
    non-null and null entries, and the share of null entries in percent.
    Nothing is returned; all output goes to stdout.
    """
    n_rows, n_cols = df.shape
    print(f"DataFrame contains {n_rows} rows and {n_cols} columns.\n")

    null_counts = df.isnull().sum()
    overview = pd.DataFrame({
        'Data Type': df.dtypes,
        'Non-null Count': df.count(),
        'Null Count': null_counts,
        'Null Percentage (%)': (null_counts / n_rows) * 100,
    })
    print(overview)

# Run the function to get an enhanced summary of the DataFrame
# dataframe_summary(df)

# 2. Preprocess DataFrame

In [None]:
# Restrict the merged frame to the columns used by the rest of the notebook.
df = df[[
    'my_app_id',
    'num_downloads',
    'rating_app',
    'nb_rating',
    'price_gplay',
    'in_app',
    'content_rating_app',
    'categ_app',
    'date_published',
    'has_ads',
    'family_library',
    'developer_name',
    'developer_info',
]]

## 2.1 Classify Apps into "Free", "Paid", "Freemium"

In [None]:
# Drop rows whose price_gplay value is missing (NaN); no parsing of the price happens here
df = df.dropna(subset=['price_gplay'])

In [None]:
# Currency symbols and codes that may accompany the scraped price text.
# (The former empty-string entry was dropped: it only added an empty regex
# alternative that matches at every position and removes nothing.)
currency_codes = ["€", "£", "$", "USD", "CZK", "EUR", "RUB", "AUD", "GBP", "JPY", "CAD", "KRW", "INR", "AED",
                  "DKK", "SAR", "PLN", "UAH", "SEK", "IDR", "TRY", "TWD", "ILS", "HKD", "BGN", "MXN", "MYR",
                  "BRL", "CLP", "CHF"]

# Alternation of the escaped symbols/codes (escaping matters for "$")
currency_pattern = r'|'.join(map(re.escape, currency_codes))

df_filtered = df.copy()

# Strip currency markers and surrounding whitespace; non-string prices become
# None and end up as NaN after the float cast below.
df_filtered['price_value'] = df_filtered['price_gplay'].apply(
    lambda x: re.sub(currency_pattern, '', x).strip() if isinstance(x, str) else None
)

# Decimal comma -> decimal point. regex=False is spelled out because the
# default of Series.str.replace is not the same in every pandas version.
df_filtered['price_value'] = df_filtered['price_value'].str.replace(',', '.', regex=False)

# Remove a leftover fragment of scraped markup. The text is matched literally;
# it contains '.' where the raw text presumably had ',' because the comma
# replacement above has already run.
df_filtered['price_value'] = df_filtered['price_value'].str.replace(
    'initial-scale=1. minimum-scale " hre', '', regex=False
)

# Drop rows whose price text is empty after cleaning; .copy() keeps the
# assignment below from writing into a slice of the previous frame.
df_filtered = df_filtered[df_filtered['price_value'] != ''].copy()

# Convert to float (raises ValueError if any non-numeric text is left)
df_filtered['price_value'] = df_filtered['price_value'].astype(float)

In [None]:
# Classify every app by pricing model. Later cells group and filter on
# df_filtered['price_category'], so the column has to exist.
#   paid     - price_value is not 0
#   freemium - price_value is 0 and in_app == 1
#   free     - price_value is 0 and in_app != 1
# np.select takes the first matching condition, so "paid" wins over "freemium".
df_filtered['price_category'] = np.select(
    [df_filtered['price_value'] != 0, df_filtered['in_app'] == 1],
    ['paid', 'freemium'],
    default='free',
)

# Per-category subsets are not materialised here; the cells that need them
# filter on 'price_category' directly.

In [None]:
# Cast the two monetisation flags to bool in a single astype call.
# NOTE(review): astype(bool) turns NaN and any non-empty string into True;
# this presumably expects numeric 0/1 flags — confirm against the CSV.
df_filtered = df_filtered.astype({'in_app': bool, 'has_ads': bool})

In [None]:
# level 0 - Free: price 0, no in-app purchases, no ads
df_level0 = df_filtered.loc[
    df_filtered['price_value'].eq(0)
    & df_filtered['in_app'].eq(0)
    & df_filtered['has_ads'].eq(0)
]

# level 1 - In-app advertising: price 0, no in-app purchases, ads
df_level1 = df_filtered.loc[
    df_filtered['price_value'].eq(0)
    & df_filtered['in_app'].eq(0)
    & df_filtered['has_ads'].eq(1)
]

# level 2 - Sample & premium (HARD).
# For example Nova Launcher and Nova Launcher Prime.

# level 3 - Freemium: price 0, in-app purchases, ads
df_level3 = df_filtered.loc[
    df_filtered['price_value'].eq(0)
    & df_filtered['in_app'].eq(1)
    & df_filtered['has_ads'].eq(1)
]

# level 4 - Semi-premium: price above 0, in-app purchases, no ads
df_level4 = df_filtered.loc[
    df_filtered['price_value'].gt(0)
    & df_filtered['in_app'].eq(1)
    & df_filtered['has_ads'].eq(0)
]

# level 5 - Premium: price above 0, no in-app purchases, no ads
df_level5 = df_filtered.loc[
    df_filtered['price_value'].gt(0)
    & df_filtered['in_app'].eq(0)
    & df_filtered['has_ads'].eq(0)
]


# Questions/remarks
# Is level 4 without ads?
# You need to identify two apps: 1 is free with ads, and the other is paid with no in-app purchases and ads.


In [None]:
# Free candidates: price 0 and no in-app purchases; has_ads may be 1 or 0.
df_free_with_possible_ads = df_filtered.loc[
    df_filtered['price_value'].eq(0)
    & df_filtered['in_app'].eq(0)
    & df_filtered['has_ads'].isin([1, 0])
]

# Premium candidates: price above 0 and no ads; in_app may be 1 or 0.
df_premium = df_filtered.loc[
    df_filtered['price_value'].gt(0)
    & df_filtered['in_app'].isin([1, 0])
    & df_filtered['has_ads'].eq(0)
]

In [None]:
# Developer names that occur in both candidate groups.
# Both key columns are de-duplicated before the inner merge: merging the raw
# columns would emit one row per (free app, premium app) pair of each
# developer, only for .unique() to collapse them again.
common_developers = pd.merge(
    df_free_with_possible_ads[['developer_name']].drop_duplicates(),
    df_premium[['developer_name']].drop_duplicates(),
    on='developer_name',
    how='inner'
)['developer_name'].unique()

# Keep only the apps of developers that appear in both groups
df_free_with_possible_ads_filtered = df_free_with_possible_ads[
    df_free_with_possible_ads['developer_name'].isin(common_developers)
]

df_premium_filtered = df_premium[
    df_premium['developer_name'].isin(common_developers)
]


In [None]:
# App ids to match against each other: premium candidates (list1) and free candidates (list2)
list1 = df_premium_filtered['my_app_id'].tolist()
list2 = df_free_with_possible_ads_filtered['my_app_id'].tolist()

# Function to find similar words with at least 75% similarity
def find_similar_words(list1, list2, threshold=75):
    """Pair strings from *list1* with similar strings from *list2*.

    Both lists are embedded as TF-IDF vectors over character n-grams of
    length 2 to 4. For every entry of *list1* the nearest entries of *list2*
    (at most 10) are looked up by cosine distance, and a pair is kept when
    its similarity, ``(1 - distance) * 100``, reaches *threshold*.

    Args:
        list1: List of strings to find matches for.
        list2: List of strings to search in.
        threshold: Minimum similarity in percent for a pair to be reported.

    Returns:
        A DataFrame with the columns 'Word from List 1', 'Word from List 2'
        and 'Similarity (%)'. It has no rows when either list is empty.
    """
    columns = ['Word from List 1', 'Word from List 2', 'Similarity (%)']

    # Nothing to compare: the vectorizer and the neighbour search would both
    # fail on empty input, so return the empty result directly.
    if len(list1) == 0 or len(list2) == 0:
        return pd.DataFrame(columns=columns)

    # Fit one vocabulary on both lists so their vectors share a feature space
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    tfidf_matrix = vectorizer.fit_transform(list1 + list2)

    # The first len(list1) rows belong to list1, the rest to list2
    split_at = len(list1)
    tfidf_list1 = tfidf_matrix[:split_at]
    tfidf_list2 = tfidf_matrix[split_at:]

    # Never ask for more neighbours than list2 has entries
    n_neighbors = min(10, len(list2))
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm='brute', n_jobs=-1)
    nn.fit(tfidf_list2)
    distances, indices = nn.kneighbors(tfidf_list1)

    similar_pairs = []
    for i, (dists, idxs) in enumerate(zip(distances, indices)):
        for dist, idx in zip(dists, idxs):
            similarity = (1 - dist) * 100
            if similarity >= threshold:
                similar_pairs.append((list1[i], list2[idx], similarity))

    return pd.DataFrame(similar_pairs, columns=columns)

# Match the premium app ids (list1) against the free app ids (list2) using the default threshold
similarities_df = find_similar_words(list1, list2)


In [None]:
def get_simple_string_difference(str1, str2):
    """Describe position by position how *str1* differs from *str2*.

    Characters that agree are copied as they are; a mismatch is written as
    ``(a->b)``. If one string is longer, its surplus tail is appended as
    ``(-tail)`` when it belongs to *str1* and ``(+tail)`` when it belongs
    to *str2*.
    """
    pieces = [a if a == b else f"({a}->{b})" for a, b in zip(str1, str2)]

    # Length of the part both strings cover; anything beyond it is the tail
    shared = min(len(str1), len(str2))
    if len(str1) > shared:
        pieces.append(f"(-{str1[shared:]})")
    elif len(str2) > shared:
        pieces.append(f"(+{str2[shared:]})")

    return ''.join(pieces)

# Despite its name, 'Difference Score' holds the textual diff of the two lower-cased ids, not a number
similarities_df['Difference Score'] = similarities_df.apply(lambda row: get_simple_string_difference(row['Word from List 1'].lower(), row['Word from List 2'].lower()), axis=1)

In [None]:
similarities_df['First_Word_Column1'] = similarities_df['Word from List 1'].str.split('.').str[0]
similarities_df['First_Word_Column2'] = similarities_df['Word from List 2'].str.split('.').str[0]

In [None]:
similarities_df['Comparison_Result'] = similarities_df['First_Word_Column1'] == similarities_df['First_Word_Column2']

In [None]:
similarities_df = similarities_df[similarities_df['Comparison_Result'] == True]

In [None]:
similarities_df

In [None]:
total_words = similarities_df['Word from List 1'].to_list() + similarities_df['Word from List 2'].to_list()

In [None]:
df_premium_filtered = df_premium_filtered.reset_index(drop=True)
df_free_with_possible_ads_filtered = df_free_with_possible_ads_filtered.reset_index(drop=True)
df_total = pd.concat([df_premium_filtered, df_free_with_possible_ads_filtered])

In [None]:
df_total = df_total.reset_index(drop=True)
df_total_filtered = df_total[df_total['my_app_id'].isin(set(total_words))]

In [None]:
df_level2 = df_total_filtered

In [None]:
# Report the number of rows in each level frame, level 0 through level 5
level_frames = (df_level0, df_level1, df_level2, df_level3, df_level4, df_level5)
for level_number, level_frame in enumerate(level_frames):
    print(f"Level {level_number}: {len(level_frame)}")

## 2.2 Number of Downloads

In [None]:
# Function to clean and convert to numbers
def convert_to_numeric(value):
    """Convert a download-count value such as ``"1,000+"`` to an int.

    Args:
        value: A missing value, a number, or a string that may contain
            ',', '+', '>' and surrounding whitespace.

    Returns:
        ``np.nan`` for missing input, otherwise the count as an ``int``.
        Numbers are passed through ``int()``, so a fractional part is
        truncated.

    Raises:
        ValueError: If the cleaned string is not an integer literal.
    """
    if pd.isna(value):
        return np.nan

    # Already numeric, e.g. when this cell is re-run on a converted column;
    # numbers have no .replace(), so they must not reach the string branch.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return int(value)

    # Remove ',' and '+' and '>' and strip any whitespace
    cleaned_value = value.replace(',', '').replace('+', '').replace('>', '').strip()
    return int(cleaned_value)

# Replace the num_downloads column with its cleaned numeric form (missing values stay NaN)
df_filtered['num_downloads'] = df_filtered['num_downloads'].apply(convert_to_numeric)

## 2.3 Ratings

In [None]:
# Drops every row that has a missing value in ANY column, not only in the rating columns.
# NOTE(review): confirm that discarding rows for gaps in unrelated columns is intended.
df_filtered = df_filtered.dropna()

In [None]:
# Rating counts carry thousands separators, e.g. "76,545" -> 76545
df_filtered['nb_rating'] = pd.to_numeric(df_filtered['nb_rating'].str.replace(',', ''))
df_filtered['rating_app'] = pd.to_numeric(df_filtered['rating_app'])

# Mean rating over all apps; serves as the prior of the Bayesian average
global_mean_rating = df_filtered['rating_app'].mean()

# Prior weight: the larger m is, the more ratings an app needs before its
# own rating outweighs the global mean.
m = 50

# Weighted mean of the app's own rating (weight nb_rating) and the prior (weight m)
df_filtered['bayesian_average'] = (
    df_filtered['rating_app'] * df_filtered['nb_rating'] + global_mean_rating * m
) / (df_filtered['nb_rating'] + m)


## 2.4 Content Rating

In [None]:
def _collapse_pegi(label):
    """Return the leading 'PEGI <age>' part of *label*, or *label* itself if it has none."""
    found = re.match(r'PEGI (3|7|12|16|18)', label)
    return found.group() if found else label


# Reduce labels that start with a PEGI rating to that rating alone
df_filtered["content_rating_app"] = [
    _collapse_pegi(item) for item in df_filtered["content_rating_app"]
]

# Make sure the column has object dtype before writing a string into it
df_filtered['content_rating_app'] = df_filtered['content_rating_app'].astype('object')

# Every label containing 'not yet been rated' is replaced by exactly that phrase
df_filtered.loc[
    df_filtered['content_rating_app'].str.contains('not yet been rated'),
    'content_rating_app',
] = 'not yet been rated'

## 2.5 Dates

In [None]:
# Parse publication dates; errors='coerce' turns unparseable values into NaT instead of raising
df_filtered['date_published'] = pd.to_datetime(df_filtered['date_published'], errors='coerce')

# 3. Graphs

## 3.1 Main observations

### 3.1.1 Date Published

In [None]:
# Histogram (20 bins) of the date_published column
df_filtered['date_published'].hist(bins=20, figsize=(10, 6))
plt.xlabel('Date Published')
plt.ylabel('Frequency')
plt.title('Distribution of Date Published')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

### 3.1.2 Rating

#### 3.1.2.1 Distribution of Ratings

In [None]:
# Histogram (20 bins) of the raw rating_app column
df_filtered['rating_app'].hist(bins=20, figsize=(10, 6))
plt.xlabel('App Rating')
plt.ylabel('Frequency')
plt.title('Distribution of App Rating')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

#### 3.1.2.2 Distribution of Bayesian Ratings

In [None]:
# Histogram (20 bins) of the bayesian_average column
df_filtered['bayesian_average'].hist(bins=20, figsize=(10, 6))
plt.xlabel('Bayesian average App Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Bayesian average App Rating')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.show()

### 3.1.3 Number of downloads

In [None]:
# Bin edges for the download counts. The bins are left-closed (right=False):
# the counts come from "N+" style text, i.e. they are lower bounds, so a value
# such as 1000 ("1,000+") belongs to '1k-10k' and not to the bin below it.
# Left-closed bins also keep apps with 0 downloads, which the default
# right-closed (0, 100] interval would turn into NaN.
bins = [0, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, np.inf]
labels = ['0-100', '100-1k', '1k-10k', '10k-100k', '100k-1M', '1M-10M', '10M-100M', '100M-1B', '1B+']

# Categorize num_downloads into these bins
df_filtered['downloads_category'] = pd.cut(
    df_filtered['num_downloads'], bins=bins, labels=labels, right=False
)

# Bar chart of the number of apps per bin; sort=False keeps the bins in their defined order
plt.figure(figsize=(12, 6))
df_filtered['downloads_category'].value_counts(sort=False).plot(kind='bar', edgecolor='black')
plt.xlabel('Number of Downloads (Categories)')
plt.ylabel('Frequency')
plt.title('Distribution of Number of Downloads by Categories')
plt.xticks(rotation=45)
plt.show()

### 3.1.4 WARNING: Prices (this is not adjusted to currencies)

In [None]:
# Histogram of price_value in 50 bins spanning -1 to (largest price + 1), with a log-scaled count axis
plt.figure(figsize=(12, 8))
plt.hist(df_filtered['price_value'], bins=50, edgecolor="black", color="skyblue", range=(-1, max(df_filtered['price_value']) + 1))
plt.yscale("log")  # Log scale to better visualize frequencies

# Adding labels and title
plt.title("Distribution of Prices on Google Play")
plt.xlabel("Price")
plt.ylabel("Log Frequency")

plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### 3.1.5 Content Ratings

In [None]:
# Pie chart of the share of each content_rating_app value
plt.figure(figsize=(8, 8))
content_rating_counts = df_filtered["content_rating_app"].value_counts(normalize=True) * 100  # percentage
content_rating_counts.plot(kind='pie', autopct='%1.1f%%', startangle=140)
plt.title("Proportion of Different Content Ratings")
plt.ylabel("")  # Hide y-axis label for cleaner look
plt.show()

### 3.1.6 Category of apps

In [None]:
# Number of apps per categ_app value, most frequent first
category_counts = df_filtered['categ_app'].value_counts()

plt.figure(figsize=(8, 8))

# Pie chart with one wedge per category, labelled with its percentage share
category_counts.plot(kind='pie', autopct='%1.1f%%', startangle=140, wedgeprops={'edgecolor': 'black'})

plt.title("Distribution of App Categories")

plt.ylabel("")  # Hide y-axis label for cleaner look

plt.show()

### 3.1.7 Apps with in-app purchases and ads

In [None]:
# Share (in percent) of apps with and without each feature. reindex makes sure
# both True and False are present, with 0 % for a value that never occurs, so
# the lookups below cannot raise KeyError on a one-sided column.
in_app_percentage = (
    df_filtered["in_app"].value_counts(normalize=True) * 100
).reindex([True, False], fill_value=0.0)
has_ads_percentage = (
    df_filtered["has_ads"].value_counts(normalize=True) * 100
).reindex([True, False], fill_value=0.0)

# Creating a DataFrame for plotting
percentage_data = pd.DataFrame({
    "Feature": ["In-app Purchases", "No In-app Purchases", "Has Ads", "No Ads"],
    "Percentage": [in_app_percentage[True], in_app_percentage[False], has_ads_percentage[True], has_ads_percentage[False]]
})

# Plotting the bar plot for percentages of apps with in-app purchases and ads
plt.figure(figsize=(10, 6))
plt.bar(percentage_data["Feature"], percentage_data["Percentage"], color=['skyblue', 'lightgreen', 'coral', 'gold'], edgecolor='black')
plt.title("Percentage of Apps with In-app Purchases and Ads")
plt.xlabel("Feature")
plt.ylabel("Percentage (%)")
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### 3.1.8 Family Library Support

In [None]:
# Count apps per family_library value. value_counts() orders by frequency, so
# the counts are re-sorted by value: the fixed tick labels below then always
# sit under the bar they describe, whichever group is larger.
# NOTE(review): assumes family_library is a two-valued 0/1 (or False/True) flag — confirm.
family_library_counts = df_filtered["family_library"].value_counts().sort_index()

# Plotting the count of apps supporting or not supporting the family library feature
plt.figure(figsize=(8, 6))
family_library_counts.plot(kind='bar', color=['skyblue', 'coral'], edgecolor='black')
plt.title("Count of Apps Supporting Family Library Feature")
plt.xlabel("Family Library Support")
plt.ylabel("Number of Apps")
plt.xticks([0, 1], labels=["Does Not Support", "Supports"], rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### 3.1.9 Average app rating per category

In [None]:
# Splitting categories into two parts for better readability
unique_categories = df_filtered["categ_app"].unique()
midpoint = len(unique_categories) // 2
categories_part1 = unique_categories[:midpoint]
categories_part2 = unique_categories[midpoint:]

# Filtering data for each part
df_category_part1 = df_filtered[df_filtered["categ_app"].isin(categories_part1)]
df_category_part2 = df_filtered[df_filtered["categ_app"].isin(categories_part2)]

# Box plot of app ratings by category (Part 1).
# DataFrame.boxplot(by=...) opens its own figure, so the size is passed to it
# directly; a preceding plt.figure() would only leave an empty figure behind.
df_category_part1.boxplot(column="rating_app", by="categ_app", grid=False, vert=False, figsize=(12, 6))
plt.title("Average App Ratings by Category (Part 1)")
plt.suptitle("")  # Remove default title to keep it clean
plt.xlabel("Average Rating")
plt.ylabel("App Category")
plt.show()

# Box plot of app ratings by category (Part 2), sized the same way
df_category_part2.boxplot(column="rating_app", by="categ_app", grid=False, vert=False, figsize=(12, 6))
plt.title("Average App Ratings by Category (Part 2)")
plt.suptitle("")  # Remove default title to keep it clean
plt.xlabel("Average Rating")
plt.ylabel("App Category")
plt.show()


## 3.2 Topic 2: Freemium vs Paid-For Apps observations

### 3.2.1 Subway Surfers

In [None]:
# Show the row(s) of a single app, selected by its package id
df_filtered[df_filtered['my_app_id'] == 'com.kiloo.subwaysurf']


### 3.2.2 Downloads per price category

#### 3.2.2.1 Total downloads

In [None]:
# Summed downloads per price category, as a flat two-column frame
total_downloads = df_filtered.groupby('price_category', as_index=False)['num_downloads'].sum()

# Bar colour for each price category
palette = {"free": "blue", "paid": "green", "freemium": "orange"}

# One bar per category; hue mirrors x so that the palette applies per bar
plt.figure(figsize=(10, 6))
sns.barplot(
    data=total_downloads,
    x='price_category',
    y='num_downloads',
    hue='price_category',
    palette=palette,
    dodge=False,
)
plt.legend([], [], frameon=False)  # Remove the legend

# Title and axis labels
plt.title("Total Number of Downloads by Price Category")
plt.xlabel("Price Category")
plt.ylabel("Total Number of Downloads")
plt.show()


#### 3.2.2.2 Average downloads

In [None]:
# Calculate the mean number of downloads for each price category
mean_freemium = df_filtered[df_filtered['price_category'] == 'freemium']['num_downloads'].mean()
mean_free = df_filtered[df_filtered['price_category'] == 'free']['num_downloads'].mean()
mean_paid = df_filtered[df_filtered['price_category'] == 'paid']['num_downloads'].mean()


In [None]:
# Create a dictionary for the means
means = {
    'Free': mean_free,
    'Freemium': mean_freemium,
    'Paid': mean_paid,
}

# Convert to a pandas DataFrame for easier plotting
mean_df = pd.DataFrame(list(means.items()), columns=['Price Category', 'Average Downloads'])


In [None]:
# Set up a color palette for price categories
# (keys are capitalised here because they must match the labels stored in mean_df)
palette = {"Free": "blue", "Paid": "green", "Freemium": "orange"}

# Set up the plot
plt.figure(figsize=(8, 6))

# Create the bar plot with the updated parameters
# (hue repeats the x column so that the palette colours each bar)
sns.barplot(x='Price Category', y='Average Downloads', data=mean_df, hue='Price Category', palette=palette, dodge=False)
plt.legend([], [], frameon=False)  # Removes the legend

# Add labels and a title
plt.title('Average Number of Downloads by Price Category', fontsize=16)
plt.xlabel('Price Category', fontsize=12)
plt.ylabel('Average Number of Downloads', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()

### 3.2.3 Rating by Price Category

In [None]:
# Set up a color palette for price categories
# (lower-case keys, matching the values compared against in the price_category column)
palette = {"free": "blue", "paid": "green", "freemium": "orange"}

#### 3.2.3.1 Average Rating

In [None]:
# Box plot of rating_app for each price_category value.
# Requires df_filtered to contain a 'price_category' column.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_filtered, x='price_category', y='rating_app', hue='price_category', palette=palette, legend=False)
plt.title("Average Rating by Price Category")
plt.xlabel("Price Category")
plt.ylabel("App Rating")
plt.show()


#### 3.2.3.2 Average Bayesian Rating

In [None]:
# Box plot of bayesian_average for each price_category value.
# Requires df_filtered to contain a 'price_category' column.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_filtered, x='price_category', y='bayesian_average', hue='price_category', palette=palette, legend=False)
plt.title("Bayesian Average by Price Category")
plt.xlabel("Price Category")
plt.ylabel("Bayesian Average")
plt.show()


#### 3.2.3.3 Average Number of Ratings

In [None]:
# Bar plot of nb_rating per price_category, without error bars (errorbar=None).
# No estimator is passed, so seaborn's default aggregation applies (the mean, per the seaborn docs).
plt.figure(figsize=(10, 6))
sns.barplot(data=df_filtered, x='price_category', y='nb_rating', hue='price_category', palette=palette, errorbar=None, legend=False)
plt.title("Number of Ratings by Price Category")
plt.xlabel("Price Category")
plt.ylabel("Number of Ratings")
plt.show()

#### 3.2.3.4 Total Number of Ratings

In [None]:
# Summed rating counts per price category, as a flat two-column frame
total_ratings = df_filtered.groupby('price_category', as_index=False)['nb_rating'].sum()

# One bar per category; hue mirrors x so that the palette applies per bar
plt.figure(figsize=(10, 6))
sns.barplot(
    data=total_ratings,
    x='price_category',
    y='nb_rating',
    hue='price_category',
    palette=palette,
    dodge=False,
)
plt.legend([], [], frameon=False)  # Remove the legend if not needed

# Title and axis labels
plt.title("Total Number of Ratings by Price Category")
plt.xlabel("Price Category")
plt.ylabel("Total Number of Ratings")
plt.show()

### 3.2.4 TODO: Outliers

### 3.2.5 TODO: Distribution of free, freemium, paid. Average downloads per category



# 4. Merge SDK dataset

In [None]:
# Load the app -> SDK table and keep only the columns whose name does not start with "Unnamed"
app_sdk = pd.read_csv('app_sdk/app_sdk.csv').loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

In [None]:
# Inner join: keep only apps present in both tables (df_filtered.my_app_id == app_sdk.id)
df_app_and_sdk = df_filtered.merge(
    app_sdk, how='inner', left_on='my_app_id', right_on='id'
)

# 5. Graphs SDK

## 5.1 SDK type counts

In [None]:
# Bar chart of how many rows of the merged frame carry each sdk_type value
plt.figure(figsize=(10, 6))
df_app_and_sdk['sdk_type'].value_counts().plot(kind='bar')
plt.title("Counts of Each SDK Type")
plt.xlabel("SDK Type")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5.2 Top 10 used SDKs

In [None]:
# top 10 most used SDKs by free apps
df_free = df_app_and_sdk[df_app_and_sdk['price_category'] == 'free']
df_free = df_free.dropna(subset=['sdk_name'])
df_free = df_free['sdk_name'].value_counts().head(10)
df_free

In [None]:
# top 10 most used SDKs by freemium apps
df_freemium = df_app_and_sdk[df_app_and_sdk['price_category'] == 'freemium']
df_freemium = df_freemium.dropna(subset=['sdk_name'])
df_freemium = df_freemium['sdk_name'].value_counts().head(10)
df_freemium

In [None]:
# top 10 most used SDKs by paid apps
df_paid = df_app_and_sdk[df_app_and_sdk['price_category'] == 'paid']
df_paid = df_paid.dropna(subset=['sdk_name'])
df_paid = df_paid['sdk_name'].value_counts().head(10)
df_paid

## 5.3 Top 10 used SDKs dropping duplicates

In [None]:
df_new = df_app_and_sdk.drop_duplicates(subset=['my_app_id', 'sdk_name'])

In [None]:
df_free = df_new[df_new['price_category'] == 'free']
df_freemium = df_new[df_new['price_category'] == 'freemium']
df_paid = df_new[df_new['price_category'] == 'paid']

In [None]:
df_free['sdk_name'].value_counts().head(10)

In [None]:
df_freemium['sdk_name'].value_counts().head(10)

In [None]:
df_paid['sdk_name'].value_counts().head(10)