<a href="https://colab.research.google.com/github/ibixina/ML/blob/main/spyData/tornSpyModels_battleScore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# import models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

# import mean squared error
from sklearn.metrics import mean_squared_error

In [16]:
df = pd.read_csv("/content/drive/MyDrive/spyData.csv")
df.head()

Unnamed: 0,Name,Id,Strength,Defense,Speed,Dexterity,Total,level,attackswon,attackslost,...,awards,nerverefills,tokenrefills,meritsbought,daysbeendonator,rankedwarringwins,arrestsmade,weaponsbought,dumpsearches,refills
0,Captive [750078],750078,2841781187,4520090189,2321866936,2361640385,12045378697,100,11617,268,...,560,1160,64,29,4544,72,0,0,0,3693
1,RockyRoxanne [2282703],2282703,510510020,500599390,512122429,424518439,1947750278,86,5633,379,...,417,123,0,43,1848,6,0,0,0,1437
2,MrTedz [2376881],2376881,1025064,1087395,1000677,43916900,47030036,52,1783,164,...,274,68,0,0,586,14,0,0,0,405
3,Grozzy [2224217],2224217,1130536203,1010448748,921764334,1000394259,4063143544,100,12880,708,...,575,1473,259,50,2303,35,0,0,0,1622
4,Elemak [2255978],2255978,1634461750,1010822390,1154327412,2185168934,5984780486,100,22460,1280,...,571,1405,273,50,2245,64,0,0,0,1645


In [17]:
# data pre processing
ignore_cols = ["Strength", "Defense", "Speed", "Dexterity", "Name", "Id"]

for col in ignore_cols:
  if col == "Name": continue
  df[col] = df[col].replace({",": ""}, regex=True).astype(int)

# add a new field
df["StatScore"] = (df["Strength"]**0.5 + df["Defense"]**0.5 + df["Speed"]**0.5 + df["Dexterity"]**0.5)

# convert total to int ignoring the commas
df["Total"] = df["Total"].replace({",": ""}, regex=True).astype(int)

print(df.head())

# drop ignore cols
df = df.drop(ignore_cols, axis=1)



STAT_UPPER = 10000000000

# # remove data if total is greater than STAT_UPPER
df = df[df["Total"] < STAT_UPPER]
# df = df[df["Total"] >= STAT_UPPER]

df.drop(["Total"], axis=1)

# y = df["Total"]
y = df["StatScore"]
X = df.drop("StatScore", axis=1)

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

numerical_features = X.select_dtypes(include=['number']).columns
print(numerical_features)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

                     Name       Id    Strength     Defense       Speed  \
0        Captive [750078]   750078  2841781187  4520090189  2321866936   
1  RockyRoxanne [2282703]  2282703   510510020   500599390   512122429   
2        MrTedz [2376881]  2376881     1025064     1087395     1000677   
3        Grozzy [2224217]  2224217  1130536203  1010448748   921764334   
4        Elemak [2255978]  2255978  1634461750  1010822390  1154327412   

    Dexterity        Total  level  attackswon  attackslost  ...  nerverefills  \
0  2361640385  12045378697    100       11617          268  ...          1160   
1   424518439   1947750278     86        5633          379  ...           123   
2    43916900     47030036     52        1783          164  ...            68   
3  1000394259   4063143544    100       12880          708  ...          1473   
4  2185168934   5984780486    100       22460         1280  ...          1405   

   tokenrefills  meritsbought  daysbeendonator  rankedwarringwins  \

((1043, 179), (261, 179), (1043,), (261,))

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# preprocessing pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

transformers_list = [('num', numerical_pipeline, numerical_features)]
preprocessor = ColumnTransformer(transformers=transformers_list, remainder='passthrough')


In [19]:
models = {
            'Linear Regression': LinearRegression(),
            'Ridge': Ridge(random_state=42),
            'Lasso': Lasso(random_state=42, max_iter=7000),
            'ElasticNet': ElasticNet(random_state=42, max_iter=7000),
            'Decision Tree': DecisionTreeRegressor(random_state=42),
            'Random Forest': RandomForestRegressor(random_state=42, n_jobs=-1),
            'Gradient Boosting': GradientBoostingRegressor(random_state=42),
            'K-Nearest Neighbors': KNeighborsRegressor(n_jobs=-1)
        }
param_grids = {
            'Ridge': {'alpha': [1.0, 10.0, 100.0, 1000.0]},
            'Lasso': {'alpha': [0.1, 1.0, 10.0, 100.0]},
            'ElasticNet': {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.5, 0.75, 0.9]},
            'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20, None]},
            'Gradient Boosting': {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]},
            'K-Nearest Neighbors': {'n_neighbors': [5, 7, 9]}
        }



In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
# Model training and evaluation loop
results_list = []
best_model_overall = None
best_r2_overall = -np.inf


for name, model_instance in models.items():
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model_instance)
    ])

    current_params_for_tuning = {}
    if name in param_grids:
        current_params_for_tuning = {f'regressor__{k}': v for k, v in param_grids[name].items()}
        search = GridSearchCV(full_pipeline, current_params_for_tuning, cv=5, scoring='r2', n_jobs=-1, error_score='raise')
        try:
            search.fit(X_train, y_train)
            final_model_to_evaluate = search.best_estimator_
            best_params_found = search.best_params_
        except Exception as e:
            print(f"Error during GridSearchCV for {name}: {e}. Fitting with default params.")
            try:
                full_pipeline.fit(X_train, y_train)
                final_model_to_evaluate = full_pipeline
                best_params_found = "default (GridSearchCV failed)"
            except Exception as e_fit:
                print(f"Error fitting {name} with defaults: {e_fit}")
                continue
    else:
        try:
            full_pipeline.fit(X_train, y_train)
            final_model_to_evaluate = full_pipeline
            best_params_found = "default (no grid search)"
        except Exception as e:
            print(f"Error fitting {name}: {e}")
            continue

    try:
        y_pred = final_model_to_evaluate.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        results_list.append({'Model': name, 'R2 Score': r2, 'RMSE': rmse, 'Best Parameters': best_params_found})
        print(f"{name} - R2: {r2:.4f}, RMSE: {rmse:.2f}")

        if r2 > best_r2_overall:
            best_r2_overall = r2
            best_model_overall = final_model_to_evaluate
    except Exception as e_pred:
        print(f"Error during prediction/scoring for {name}: {e_pred}")
        results_list.append({'Model': name, 'R2 Score': np.nan, 'RMSE': np.nan, 'Best Parameters': 'Error in prediction/scoring'})

# Display results
print("\n\n--- Overall Model Evaluation Results ---")
if results_list:
    results_df = pd.DataFrame(results_list).sort_values(by='R2 Score', ascending=False)
    print(results_df.to_string())

    if best_model_overall:
        best_model_name = results_df.iloc[0]['Model']
        best_model_r2 = results_df.iloc[0]['R2 Score']
        best_model_rmse = results_df.iloc[0]['RMSE']
        best_model_params = results_df.iloc[0]['Best Parameters']
        print(f"\n--- 🎉 Best Performing Model (based on R2 Score) ---")
        print(f"Model: {best_model_name}")
        print(f"R2 Score: {best_model_r2:.4f}")
        print(f"RMSE: {best_model_rmse:.2f}")
        print(f"Parameters: {best_model_params}")

        # Example prediction with the best model (if X_test is not empty)
        if not X_test.empty:
            sample_prediction = best_model_overall.predict(X_test.head(1))
            print(f"\nSample prediction with {best_model_name} on first test sample: {sample_prediction[0]:.2f}")
            print(f"Actual value for this sample: {y_test.iloc[0]:.2f}")
    else:
        print("\nNo model was successfully selected as the best.")
else:
    print("No models were successfully trained and evaluated.")

print("\n--- Script Finished ---")

Linear Regression - R2: 0.9781, RMSE: 8419.59
Ridge - R2: 0.9788, RMSE: 8286.41
Lasso - R2: 0.9798, RMSE: 8091.29
ElasticNet - R2: 0.9788, RMSE: 8285.91
Decision Tree - R2: 0.9889, RMSE: 6000.38
Random Forest - R2: 0.9944, RMSE: 4264.48
Gradient Boosting - R2: 0.9945, RMSE: 4206.30
K-Nearest Neighbors - R2: 0.9067, RMSE: 17364.69


--- Overall Model Evaluation Results ---
                 Model  R2 Score          RMSE                                                                                Best Parameters
6    Gradient Boosting  0.994528   4206.297410  {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 200}
5        Random Forest  0.994376   4264.482563                                   {'regressor__max_depth': 20, 'regressor__n_estimators': 200}
4        Decision Tree  0.988865   6000.377372                                                                       default (no grid search)
2                Lasso  0.979752   8091.288951           

In [None]:
# save the best model
import pickle
path = f'/content/drive/MyDrive/best_model_spy_prediction_{best_model_name}.pkl'

pickle.dump(best_model_overall, open(path, 'wb'))

imported_model = pickle.load(open(path, 'rb'))
print(X_test.head(1))
imported_model.predict(X_test.head(1))

      level  attackswon  attackslost  attacksdraw  attacksassisted  \
1290    100       13072         1557          204             1033   

      defendswon  defendslost  defendsstalemated   elo  yourunaway  ...  \
1290         667         3670                 28  2685         109  ...   

      awards  nerverefills  tokenrefills  meritsbought  daysbeendonator  \
1290     519           590            33            50             1915   

      rankedwarringwins  arrestsmade  weaponsbought  dumpsearches  refills  
1290                 47            0              0             0     1708  

[1 rows x 178 columns]


array([5.32107906e+09])

In [None]:
# test
import requests, json

from google.colab import userdata
API_KEY = userdata.get('api')

def getTornData(id: str) -> dict:
    url = f"https://api.torn.com/user/{id}?selections=basic,personalstats&key={API_KEY}"
    response = requests.get(url)
    data = json.loads(response.text)
    return_data = {}
    return_data["level"] = data.get("level", 1)
    personal_stats = data.get("personalstats", {})

    for key, value in personal_stats.items():
        return_data[key] = value

    return return_data

def test():
    model = pickle.load(open(path, "rb"))
    data = getTornData("2669774")
    input_data = pd.DataFrame([data])
    prediction = model.predict(input_data)
    print(prediction)

test()

[1.78826013e+09]
