In [1]:
from sklearn.model_selection import GridSearchCV
# for calculating the cost function
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split  # for splitting the data
import numpy as np  # for array operations
import pandas as pd  # for working with DataFrames
import matplotlib.pyplot as plt  # for data visualization
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [2]:
# Define the parameter grid
param_grid = {
    #'criterion': ['entropy', 'gini'],
    'max_depth': [None, 6, 8, 10, 12, 18],
    'min_samples_leaf': [2, 3, 4, 6],
    'min_samples_split': [2, 3, 4, 6]
}
dt = DecisionTreeClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator = dt,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5)

df = pd.read_csv('first_data.csv')

label_encoder = LabelEncoder()

# Assuming 'category_column' is the column with categorical data
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Content_Rating'] = label_encoder.fit_transform(df['Content_Rating'])
df['Genres'] = label_encoder.fit_transform(df['Genres'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Android_Ver'] = label_encoder.fit_transform(df['Android_Ver'])
df['Installs_category'] = label_encoder.fit_transform(df['Installs_category'])



# choose features and label
x = df.drop(['Installs_category', 'App', 'Last_Updated', 'Rating', 'Mean_App_Sentiment', 'Reviews'], axis=1)  # Features
y = df['Installs_category']  # Target

# Splitting the dataset into training and other sets (80/20)
x_train, x_other, y_train, y_other = train_test_split(x, y, test_size=0.2, random_state=28)

# Splitting the dataset into testing and validation sets (50/50)
x_test, x_validate, y_test, y_validate = train_test_split(x_other, y_other, test_size=0.5, random_state=28)


# Fit GridSearchCV
grid_search.fit(x_validate, y_validate)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_


# Predicting using the validation set
y_validate_pred = best_model.predict(x_validate)

# Calculate RMSE for validation set
rmse_validate = np.sqrt(mean_squared_error(y_validate, y_validate_pred))
print(f"Validation RMSE: {rmse_validate:.3f}")

# Predicting using the test set
y_test_pred = best_model.predict(x_test)

score = best_model.score(x_validate, y_validate)
print(score)

# Calculate RMSE for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.3f}")



Best Parameters: {'max_depth': 8, 'min_samples_leaf': 6, 'min_samples_split': 2}
Validation RMSE: 2.317
0.45907473309608543
Test RMSE: 2.478


In [3]:
# Define the parameter grid
param_grid = {
    #'criterion': ['entropy', 'gini'],
    'max_depth': [None, 6, 8, 10, 12, 18],
    'min_samples_leaf': [2, 3, 4, 6],
    'min_samples_split': [2, 3, 4, 6]
}
dt = DecisionTreeClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator = dt,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5)

df = pd.read_csv('second_data.csv')

label_encoder = LabelEncoder()

# Assuming 'category_column' is the column with categorical data
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Content_Rating'] = label_encoder.fit_transform(df['Content_Rating'])
df['Genres'] = label_encoder.fit_transform(df['Genres'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Android_Ver'] = label_encoder.fit_transform(df['Android_Ver'])
df['Installs_category'] = label_encoder.fit_transform(df['Installs_category'])



# choose features and label
x = df.drop(['Installs_category', 'App', 'Last_Updated', 'Rating', 'Mean_App_Sentiment', 'Reviews'], axis=1)  # Features
y = df['Installs_category']  # Target

# Splitting the dataset into training and other sets (80/20)
x_train, x_other, y_train, y_other = train_test_split(x, y, test_size=0.2, random_state=28)

# Splitting the dataset into testing and validation sets (50/50)
x_test, x_validate, y_test, y_validate = train_test_split(x_other, y_other, test_size=0.5, random_state=28)


# Fit GridSearchCV
grid_search.fit(x_validate, y_validate)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_


# Predicting using the validation set
y_validate_pred = best_model.predict(x_validate)

# Calculate RMSE for validation set
rmse_validate = np.sqrt(mean_squared_error(y_validate, y_validate_pred))
print(f"Validation RMSE: {rmse_validate:.3f}")

# Predicting using the test set
y_test_pred = best_model.predict(x_test)

score = best_model.score(x_validate, y_validate)
print(score)

# Calculate RMSE for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.3f}")



Best Parameters: {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 6}
Validation RMSE: 2.556
0.4306049822064057
Test RMSE: 2.634


In [5]:
# Define the parameter grid
param_grid = {
    #'criterion': ['entropy', 'gini'],
    'max_depth': [None, 6, 8, 10, 12, 18],
    'min_samples_leaf': [2, 3, 4, 6],
    'min_samples_split': [2, 3, 4, 6]
}
dt = DecisionTreeClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator = dt,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5)

df = pd.read_csv('finaldataset.csv')

label_encoder = LabelEncoder()

# Assuming 'category_column' is the column with categorical data
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Content_Rating'] = label_encoder.fit_transform(df['Content_Rating'])
df['Genres'] = label_encoder.fit_transform(df['Genres'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Android_Ver'] = label_encoder.fit_transform(df['Android_Ver'])
df['Installs_category'] = label_encoder.fit_transform(df['Installs_category'])



# choose features and label
x = df.drop(['Installs_category', 'App', 'Last_Updated', 'Rating', 'Mean_App_Sentiment', 'Reviews'], axis=1)  # Features
y = df['Installs_category']  # Target

# Splitting the dataset into training and other sets (80/20)
x_train, x_other, y_train, y_other = train_test_split(x, y, test_size=0.2, random_state=28)

# Splitting the dataset into testing and validation sets (50/50)
x_test, x_validate, y_test, y_validate = train_test_split(x_other, y_other, test_size=0.5, random_state=28)


# Fit GridSearchCV
grid_search.fit(x_validate, y_validate)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_


# Predicting using the validation set
y_validate_pred = best_model.predict(x_validate)

# Calculate RMSE for validation set
rmse_validate = np.sqrt(mean_squared_error(y_validate, y_validate_pred))
print(f"Validation RMSE: {rmse_validate:.3f}")

# Predicting using the test set
y_test_pred = best_model.predict(x_test)

score = best_model.score(x_validate, y_validate)
print(score)

# Calculate RMSE for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.3f}")



Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 6}
Validation RMSE: 2.131
0.4875444839857651
Test RMSE: 2.550
