In [None]:
import csv
import re
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

In [None]:
from google.colab import drive


# Mount Google Drive at /content/drive (requires the google.colab runtime)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the training CSV on the mounted Google Drive.
file_train = '/content/drive/MyDrive/TRAIN.csv'

# Each usable record is read as: player, race, then a variable-length list of
# action tokens (everything from the third cell onwards).
players, races, actions_list = [], [], []
# newline="" is what the csv module asks for when it is handed a file object;
# without it, line breaks embedded inside quoted fields are not parsed correctly.
with open(file_train, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip blank or truncated records that lack a player and a race cell.
        if len(row) < 2:
            continue
        players.append(row[0])
        races.append(row[1])
        actions_list.append(row[2:])  # empty list when the record has no actions

# One DataFrame row per kept CSV record; "actions" holds the raw token list.
df = pd.DataFrame({"player": players, "race": races, "actions": actions_list})


In [None]:
def remove_time_markers(seq):
    """Return the tokens of ``seq`` that are not time markers.

    A time marker is a token made of the letter ``t`` followed only by
    digits (for example ``t5`` or ``t120``). All other tokens are kept in
    their original order.

    Args:
        seq: iterable of string tokens.

    Returns:
        A new list containing the non-marker tokens.
    """
    kept = []
    for token in seq:
        if re.match(r"^t\d+$", token) is None:
            kept.append(token)
    return kept

# Drop the time-marker tokens from every action sequence.
df["actions_clean"] = df["actions"].apply(remove_time_markers)

# -----------------------
# C) Build a Counter of actions (one per replay)
# -----------------------
df["action_counter"] = df["actions_clean"].apply(Counter)


In [None]:
# Number of rows (replays) in which each action appears at least once.
# Only the *keys* of each per-replay Counter are fed to update(), so every
# replay contributes exactly 1 to each action it contains.
all_actions = Counter()
for c in df["action_counter"]:
    all_actions.update(c.keys())

MIN_REPLAYS = 5  # an action must appear in at least this many replays (example value)

# The presence counts are exactly what `all_actions` already holds, so copy
# them instead of re-scanning the whole column once per action. The copy keeps
# the same insertion order, hence the same order for `filtered_actions`.
action_presence_count = Counter(all_actions)

filtered_actions = [a for (a, cnt) in action_presence_count.items() if cnt >= MIN_REPLAYS]
print("Nombre total d'actions :", len(all_actions))
print("Nombre d'actions retenues :", len(filtered_actions))

Nombre total d'actions : 33
Nombre d'actions retenues : 33


In [None]:
def to_frequency_vector(counter_obj, actions_kept):
    """Convert action counts into relative frequencies.

    Args:
        counter_obj: mapping from action name to its count.
        actions_kept: ordered sequence of action names defining the output
            positions.

    Returns:
        A list aligned with ``actions_kept``. Each entry is the action's count
        divided by the sum of all counts in ``counter_obj``; actions absent
        from ``counter_obj`` get 0. When the counts sum to zero the list is
        all zeros.
    """
    n_events = sum(counter_obj.values())
    if n_events == 0:
        # Nothing was counted: return zeros rather than dividing by zero.
        return [0] * len(actions_kept)

    frequencies = []
    for action in actions_kept:
        frequencies.append(counter_obj.get(action, 0) / n_events)
    return frequencies

# One frequency vector per row of df, one column per kept action.
X_list = [
    to_frequency_vector(counts, filtered_actions)
    for counts in df["action_counter"]
]

X = pd.DataFrame(data=X_list, columns=filtered_actions)


In [None]:
# One-hot encode the race column into "race_<value>" indicator columns.
race_dummies = pd.get_dummies(df["race"], prefix="race")
# Select only the action-frequency columns before concatenating, so that
# re-running this cell does not stack a second copy of the race columns onto X.
X = pd.concat([X[filtered_actions], race_dummies], axis=1)

# -----------------------
# G) Encode the target (player -> ID)
# -----------------------
# IDs are assigned in order of first appearance of each player in df.
players_unique = df["player"].unique()
player_to_id = {p: i for i, p in enumerate(players_unique)}
# Inverse mapping, used to turn predicted IDs back into player names.
id_to_player = {v: k for k, v in player_to_id.items()}
y = df["player"].map(player_to_id)


In [None]:
# Hold out 20% of the rows for validation. stratify=y asks for the same class
# proportions in both parts; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
# Random forest with 100 trees; random_state is fixed for repeatable fits.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)



In [None]:
# -----------------------
# J) Evaluate on the validation split
# -----------------------
y_pred = rf.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print("Accuracy en validation :", acc)


Accuracy en validation : 0.8985270049099836


In [None]:
# Column order of the training matrix; the test features are reindexed to it.
X_columns = X.columns
mappings = dict(
    player_to_id=player_to_id,
    id_to_player=id_to_player,
    filtered_actions=filtered_actions,
    X_columns=X_columns,
)

# Write the fitted model and the mappings to one pickle file each
# (pickle is already imported in the notebook's first cell).
for pickle_path, payload in (("model_rf.pkl", rf), ("mappings.pkl", mappings)):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

print("Modèle et mappings sauvegardés.")

Modèle et mappings sauvegardés.


In [None]:
import csv
import re
import pandas as pd
from collections import Counter
import pickle

# -----------------------
# A) Load the model and the mappings
# -----------------------
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open("model_rf.pkl", "rb") as f:
    rf = pickle.load(f)

with open("mappings.pkl", "rb") as f:
    saved = pickle.load(f)

# Unpack the saved dictionary back into the names used by the cells below.
player_to_id = saved["player_to_id"]
id_to_player = saved["id_to_player"]
filtered_actions = saved["filtered_actions"]
X_columns = saved["X_columns"]

# -----------------------
# B) Read the test CSV
# -----------------------

file_test = '/content/drive/MyDrive/TEST.CSV'

# Test records carry no player label: the first cell is read as the race and
# the remaining cells as the action tokens.
races_test, actions_test = [], []
# newline="" is what the csv module asks for when it is handed a file object;
# without it, line breaks embedded inside quoted fields are not parsed correctly.
with open(file_test, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip records that are empty or contain only whitespace cells.
        # NOTE(review): the submission RowId is a running count of the kept
        # records, so a skipped record shifts every later RowId - confirm the
        # test file has no blank records in the middle.
        if not row or all(not cell.strip() for cell in row):
            continue
        race = row[0]
        actions = row[1:]
        races_test.append(race)
        actions_test.append(actions)

df_test = pd.DataFrame({"race": races_test, "actions": actions_test})
print("Test shape :", df_test.shape)

def remove_time_markers(seq):
    """Return the tokens of ``seq`` that are not time markers.

    A time marker is a token made of the letter ``t`` followed only by
    digits (for example ``t5`` or ``t120``). All other tokens are kept in
    their original order.

    Args:
        seq: iterable of string tokens.

    Returns:
        A new list containing the non-marker tokens.
    """
    kept = []
    for token in seq:
        if re.match(r"^t\d+$", token) is None:
            kept.append(token)
    return kept

# Same preparation as for training: drop time markers, then count each action.
df_test["actions_clean"] = df_test["actions"].apply(remove_time_markers)
df_test["action_counter"] = df_test["actions_clean"].apply(Counter)

# -----------------------
# C) Conversion to frequencies (same filtered_actions)
# -----------------------
def to_frequency_vector(counter_obj, actions_kept):
    """Convert action counts into relative frequencies.

    Args:
        counter_obj: mapping from action name to its count.
        actions_kept: ordered sequence of action names defining the output
            positions.

    Returns:
        A list aligned with ``actions_kept``. Each entry is the action's count
        divided by the sum of all counts in ``counter_obj``; actions absent
        from ``counter_obj`` get 0. When the counts sum to zero the list is
        all zeros.
    """
    n_events = sum(counter_obj.values())
    if n_events == 0:
        # Nothing was counted: return zeros rather than dividing by zero.
        return [0] * len(actions_kept)

    frequencies = []
    for action in actions_kept:
        frequencies.append(counter_obj.get(action, 0) / n_events)
    return frequencies

# One frequency vector per row of df_test, one column per kept action.
X_list_test = [
    to_frequency_vector(counts, filtered_actions)
    for counts in df_test["action_counter"]
]

X_test_bag = pd.DataFrame(data=X_list_test, columns=filtered_actions)

# -----------------------
# D) One-hot encode race (then concatenate)
# -----------------------
race_dummies_test = pd.get_dummies(df_test["race"], prefix="race")
X_test_full = pd.concat([X_test_bag, race_dummies_test], axis=1)

# -----------------------
# E) Reindex according to X_columns
# -----------------------
# Columns listed in X_columns but absent here are created and filled with 0;
# columns not listed in X_columns are dropped.
X_test_full = X_test_full.reindex(columns=X_columns, fill_value=0)

print("X_test_full shape :", X_test_full.shape)

# -----------------------
# F) Prediction
# -----------------------
pred_ids = rf.predict(X_test_full)

# -----------------------
# G) Decoding (ID -> player URL)
# -----------------------
# Map each predicted integer ID back to the player value via id_to_player.
pred_players = [id_to_player[i] for i in pred_ids]

# -----------------------
# H) Generate submission.csv
# -----------------------
# RowId is a 1-based running index over the predictions, in prediction order.
row_ids = range(1, len(pred_players) + 1)
submission = pd.DataFrame({"RowId": row_ids, "prediction": pred_players})

submission.to_csv("submission.csv", index=False)
print("Fichier submission.csv créé !")


Test shape : (340, 2)
X_test_full shape : (340, 36)
Fichier submission.csv créé !
