In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import mean_squared_error

# Load dataset
df = pd.read_csv('first_data.csv')

# Label encode categorical columns
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Content_Rating'] = label_encoder.fit_transform(df['Content_Rating'])
df['Genres'] = label_encoder.fit_transform(df['Genres'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Android_Ver'] = label_encoder.fit_transform(df['Android_Ver'])
df['Installs_category'] = label_encoder.fit_transform(df['Installs_category'])

# Choose features and label
x = df.drop(['Installs_category', 'App', 'Last_Updated', 'Rating', 'Mean_App_Sentiment', 'Reviews'], axis=1)
y = df['Installs_category']

# Splitting the dataset into training and other sets (80/20)
x_train, x_other, y_train, y_other = train_test_split(x, y, test_size=0.2, random_state=28)

# Splitting the dataset into testing and validation sets (50/50)
x_test, x_validate, y_test, y_validate = train_test_split(x_other, y_other, test_size=0.5, random_state=28)

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [50,75, 100,120, 200],
    'max_depth': [4,6,8],  # Adjust max_depth for XGBoost
    'learning_rate': [0.01,0.02,0.03, 0.1, 0.2]  # Add learning_rate for XGBoost
}

# Initialize GridSearchCV with XGBClassifier
grid_search = GridSearchCV(estimator=XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_)), 
                           param_grid=param_grid, 
                           scoring='neg_mean_squared_error', 
                           cv=3, 
                           n_jobs=-1)

# Fit GridSearchCV on the validation set
grid_search.fit(x_validate, y_validate)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_

# Predicting using the validation set
y_validate_pred = best_model.predict(x_validate)

# Calculate RMSE for validation set
rmse_validate = np.sqrt(mean_squared_error(y_validate, y_validate_pred))
print(f"Validation RMSE: {rmse_validate:.3f}")

# Predicting using the test set
y_test_pred = best_model.predict(x_test)

# Calculate accuracy on the validation set (assuming classification problem)
score = best_model.score(x_validate, y_validate)
print(f"Validation Accuracy: {score:.3f}")

# Calculate RMSE for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.3f}")




Best Parameters: {'learning_rate': 0.03, 'max_depth': 4, 'n_estimators': 100}
Validation RMSE: 1.901
Validation Accuracy: 0.655
Test RMSE: 2.343


In [2]:
# Load dataset
df = pd.read_csv('second_data.csv')

# Label encode categorical columns
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Content_Rating'] = label_encoder.fit_transform(df['Content_Rating'])
df['Genres'] = label_encoder.fit_transform(df['Genres'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Android_Ver'] = label_encoder.fit_transform(df['Android_Ver'])
df['Installs_category'] = label_encoder.fit_transform(df['Installs_category'])

# Choose features and label
x = df.drop(['Installs_category', 'App', 'Last_Updated', 'Rating', 'Mean_App_Sentiment', 'Reviews'], axis=1)
y = df['Installs_category']

# Splitting the dataset into training and other sets (80/20)
x_train, x_other, y_train, y_other = train_test_split(x, y, test_size=0.2, random_state=28)

# Splitting the dataset into testing and validation sets (50/50)
x_test, x_validate, y_test, y_validate = train_test_split(x_other, y_other, test_size=0.5, random_state=28)

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [50,75, 100,120, 200],
    'max_depth': [4,6,8],  # Adjust max_depth for XGBoost
    'learning_rate': [0.01,0.02,0.03, 0.1, 0.2]  # Add learning_rate for XGBoost
}

# Initialize GridSearchCV with XGBClassifier
grid_search = GridSearchCV(estimator=XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_)), 
                           param_grid=param_grid, 
                           scoring='neg_mean_squared_error', 
                           cv=3, 
                           n_jobs=-1)

# Fit GridSearchCV on the validation set
grid_search.fit(x_validate, y_validate)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_

# Predicting using the validation set
y_validate_pred = best_model.predict(x_validate)

# Calculate RMSE for validation set
rmse_validate = np.sqrt(mean_squared_error(y_validate, y_validate_pred))
print(f"Validation RMSE: {rmse_validate:.3f}")

# Predicting using the test set
y_test_pred = best_model.predict(x_test)

# Calculate accuracy on the validation set (assuming classification problem)
score = best_model.score(x_validate, y_validate)
print(f"Validation Accuracy: {score:.3f}")

# Calculate RMSE for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.3f}")



Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 120}
Validation RMSE: 2.239
Validation Accuracy: 0.498
Test RMSE: 2.496


In [3]:
# Load dataset
df = pd.read_csv('finaldataset.csv')

# Label encode categorical columns
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Content_Rating'] = label_encoder.fit_transform(df['Content_Rating'])
df['Genres'] = label_encoder.fit_transform(df['Genres'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Android_Ver'] = label_encoder.fit_transform(df['Android_Ver'])
df['Installs_category'] = label_encoder.fit_transform(df['Installs_category'])

# Choose features and label
x = df.drop(['Installs_category', 'App', 'Last_Updated', 'Rating', 'Mean_App_Sentiment', 'Reviews'], axis=1)
y = df['Installs_category']

# Splitting the dataset into training and other sets (80/20)
x_train, x_other, y_train, y_other = train_test_split(x, y, test_size=0.2, random_state=28)

# Splitting the dataset into testing and validation sets (50/50)
x_test, x_validate, y_test, y_validate = train_test_split(x_other, y_other, test_size=0.5, random_state=28)

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [50,75, 100,120, 200],
    'max_depth': [4,6,8],  # Adjust max_depth for XGBoost
    'learning_rate': [0.01,0.02,0.03, 0.1, 0.2]  # Add learning_rate for XGBoost
}

# Initialize GridSearchCV with XGBClassifier
grid_search = GridSearchCV(estimator=XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_)), 
                           param_grid=param_grid, 
                           scoring='neg_mean_squared_error', 
                           cv=3, 
                           n_jobs=-1)

# Fit GridSearchCV on the validation set
grid_search.fit(x_validate, y_validate)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_

# Predicting using the validation set
y_validate_pred = best_model.predict(x_validate)

# Calculate RMSE for validation set
rmse_validate = np.sqrt(mean_squared_error(y_validate, y_validate_pred))
print(f"Validation RMSE: {rmse_validate:.3f}")

# Predicting using the test set
y_test_pred = best_model.predict(x_test)

# Calculate accuracy on the validation set (assuming classification problem)
score = best_model.score(x_validate, y_validate)
print(f"Validation Accuracy: {score:.3f}")

# Calculate RMSE for test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse_test:.3f}")



Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 120}
Validation RMSE: 2.161
Validation Accuracy: 0.447
Test RMSE: 2.519
