# In [28]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

class EarthquakePredictor:
    """Train, persist and compare regressors that predict earthquake magnitude.

    The CSV at ``data_path`` is read once. A fixed 80/20 train/test split of
    the ``Latitude``/``Longitude`` features against the ``Magnitude`` target
    is created at construction time, so later changes to
    ``self.earthquake_data`` (e.g. via ``remove_not_needed_columns``) do not
    affect the data the models are trained and evaluated on.
    """

    def __init__(self, data_path):
        """Load the dataset from ``data_path`` and build the train/test split.

        Args:
            data_path: Path (or any source accepted by ``pandas.read_csv``)
                of a CSV containing at least the feature and target columns.
        """
        self.earthquake_data = pd.read_csv(data_path)
        self.features = ['Latitude', 'Longitude']
        self.target = 'Magnitude'
        self.X_train, self.X_test, self.y_train, self.y_test = self._prepare_data()
        # Most recently trained model; used by manual_prediction_test().
        self.trained_regressor = None

    def _prepare_data(self):
        """Return ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

        The split holds out 20% of the rows and uses a fixed random seed so
        that every run (and every model) sees the same partition.
        """
        X = self.earthquake_data[self.features].to_numpy()
        y = self.earthquake_data[self.target].to_numpy()
        return train_test_split(X, y, test_size=0.2, random_state=12345)

    @staticmethod
    def _model_path(model_name):
        """Return the pickle file name used to persist ``model_name``."""
        return f'{model_name.lower().replace(" ", "_")}_regressor.pickle'

    def _evaluate(self, regressor):
        """Return ``(rmse, mae)`` of ``regressor`` on the held-out test set."""
        predictions = regressor.predict(self.X_test)
        # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        # For the single-output target used here both forms are identical.
        rmse = metrics.mean_squared_error(self.y_test, predictions) ** 0.5
        mae = metrics.mean_absolute_error(self.y_test, predictions)
        return rmse, mae

    def train_linear_regression(self):
        """Fit, persist and evaluate a ``LinearRegression`` model."""
        regressor = LinearRegression()
        self._train_and_evaluate(regressor, 'Linear Regression')

    def train_random_forest(self, n_estimators, random_state, model_name):
        """Fit, persist and evaluate a ``RandomForestRegressor``.

        Args:
            n_estimators: Number of trees in the forest.
            random_state: Seed forwarded to the regressor.
            model_name: Display name; also determines the pickle file name.
                ``compare_models`` looks for 'Random Forest (Default)' and
                'Random Forest (Adjusted)'.
        """
        regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        self._train_and_evaluate(regressor, model_name)

    def _train_and_evaluate(self, regressor, model_name):
        """Fit ``regressor``, pickle it, and print its test-set RMSE and MAE.

        The fitted model also becomes ``self.trained_regressor``.
        """
        regressor.fit(self.X_train, self.y_train)
        self.trained_regressor = regressor

        # Save the trained model so compare_models() can reload it later.
        with open(self._model_path(model_name), 'wb') as handle:
            pickle.dump(regressor, handle)

        rmse, mae = self._evaluate(regressor)
        print(f'{model_name}: RMSE = {rmse}, MAE = {mae}')

    def manual_prediction_test(self, test_data):
        """Print predictions of the most recently trained model for ``test_data``.

        Args:
            test_data: Input passed unchanged to the regressor's ``predict``.

        Prints a notice instead when no model has been trained yet.
        """
        if self.trained_regressor is not None:
            predicted_magnitude = self.trained_regressor.predict(test_data)
            print("Predicted Magnitude:", predicted_magnitude)
        else:
            print("Model not trained. Please train a model first.")

    def _load_model(self, model_name):
        """Load the pickled model for ``model_name``.

        Returns:
            The unpickled object, or ``None`` when the pickle file does not
            exist.
        """
        try:
            with open(self._model_path(model_name), 'rb') as handle:
                # NOTE(security): pickle.load can execute arbitrary code.
                # Only load model files that this class wrote itself.
                return pickle.load(handle)
        except FileNotFoundError:
            return None

    def _compare(self, model_names):
        """Return a frame with one column per model holding ``[rmse, mae]``.

        Models whose pickle file is missing are reported on stdout and their
        column is left unfilled.
        """
        comparison = pd.DataFrame(columns=model_names)
        for model_name in model_names:
            regressor = self._load_model(model_name)
            if regressor is None:
                print(f"Model not found: {model_name}")
                continue
            rmse, mae = self._evaluate(regressor)
            comparison[model_name] = [rmse, mae]
        return comparison

    def compare_models(self):
        """Re-evaluate the persisted models on the test set.

        Returns:
            A tuple ``(df_compare_first_two, df_compare_all)``. Each frame has
            one column per model; row 0 holds the RMSE and row 1 the MAE. The
            first frame covers linear regression and the default random
            forest, the second additionally covers the adjusted random forest.
        """
        df_compare_first_two = self._compare(
            ['Linear Regression', 'Random Forest (Default)'])
        df_compare_all = self._compare(
            ['Linear Regression', 'Random Forest (Default)', 'Random Forest (Adjusted)'])
        return df_compare_first_two, df_compare_all

    def display_summary_statistics(self):
        """Print ``DataFrame.describe()`` for the dataset."""
        print(self.earthquake_data.describe())

    def display_whole_tab(self):
        """Print the dataset without its last 40 rows.

        NOTE: despite the method name, ``head(-40)`` omits the final 40 rows.
        """
        print(self.earthquake_data.head(-40))

    def display_info(self):
        """Print the dataset's column/dtype summary."""
        # DataFrame.info() writes to stdout itself and returns None, so
        # wrapping it in print() would emit a stray "None" line.
        self.earthquake_data.info()

    def check_missing_values(self):
        """Print ``True`` if any cell in the dataset is null, else ``False``."""
        print(self.earthquake_data.isnull().values.any())

    def display_data_distribution(self):
        """Show a kernel density estimate of the target column."""
        self.earthquake_data[self.target].plot.kde()
        # Render explicitly, consistent with the other plotting methods.
        plt.show()

    def remove_not_needed_columns(self, columns_to_remove):
        """Drop ``columns_to_remove`` from the dataset.

        Columns that are not present are silently ignored. The train/test
        split created in ``__init__`` is not rebuilt.
        """
        self.earthquake_data = self.earthquake_data.drop(columns=columns_to_remove, errors='ignore')

    def display_box_plots(self):
        """Show one box plot each for Latitude, Longitude, Depth and Magnitude."""
        for column in ['Latitude', 'Longitude', 'Depth', 'Magnitude']:
            self.earthquake_data[column].plot(kind='box')
            plt.show()

    def display_correlation_matrix(self):
        """Show an annotated heatmap of correlations between float64 columns."""
        numeric_columns = self.earthquake_data.select_dtypes(include=['float64']).columns
        correlations = self.earthquake_data[numeric_columns].corr()

        fig, ax = plt.subplots(figsize=(12, 12))
        colormap = sns.color_palette("BrBG", 12)

        sns.heatmap(correlations,
                    cmap=colormap,
                    annot=True,
                    fmt=".2f",
                    ax=ax)

        ax.set_yticklabels(correlations.columns)
        plt.show()
