In [None]:
from google.colab import drive



# Mount Google Drive at /content/drive so the CSV files under MyDrive can be read below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import re
import numpy as np
import pandas as pd
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from scipy.stats import randint, uniform


In [None]:
# NOTE(review): the output recorded for this cell is a FileNotFoundError for this
# path, and a later cell reads TRAIN.CSV from a different Drive folder — confirm
# where the file actually lives before re-running.
file_train = '/content/drive/MyDrive/TRAIN.csv'

# Each CSV row is: player, race, then a variable-length sequence of action tokens.
players, races, actions_list = [], [], []
# newline="" is the mode the csv module documents for file objects given to csv.reader.
with open(file_train, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # A row needs at least the player and race fields to be usable.
        if len(row)<2:
            continue
        players.append(row[0])
        races.append(row[1])
        actions_list.append(row[2:])

# One row per replay; "actions" holds the raw token list (time markers included).
df = pd.DataFrame({"player":players, "race":races, "actions":actions_list})



FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/TRAIN.csv'

In [None]:
def remove_time_markers(seq):
    """Return the items of ``seq`` in order, leaving out every ``t<digits>`` marker."""
    kept = []
    for token in seq:
        if re.match(r"^t\d+$", token) is None:
            kept.append(token)
    return kept

# Action sequence of each replay with the t<digits> markers stripped out.
raw_sequences = df["actions"]
df["actions_clean"] = raw_sequences.map(remove_time_markers)

def get_last_time(seq):
    """Return the largest N among the ``tN`` markers in ``seq``, or 0 if there is none."""
    hits = (re.match(r"^t(\d+)$", token) for token in seq)
    return max((int(hit.group(1)) for hit in hits if hit), default=0)

# Largest t<digits> value per replay, read from the raw sequence (markers still present).
df["last_time"] = df["actions"].map(get_last_time)

# Per-replay tally of how many times each non-marker action occurs.
df["action_counter"] = df["actions_clean"].map(lambda tokens: Counter(tokens))


In [None]:
# all_actions[a] ends up as the number of replays in which action `a` occurs at
# least once: each replay contributes its distinct keys exactly once.
all_actions = Counter()
for c in df["action_counter"]:
    all_actions.update(c.keys())

# Minimum number of replays an action must appear in to be kept as a feature.
MIN_REPLAYS = 5

# The tally above already is the per-action presence count, so copy it instead of
# rescanning the whole frame once per action (same keys, same order, same counts).
action_presence_count = Counter(all_actions)

filtered_actions = [a for (a, cnt) in action_presence_count.items() if cnt>=MIN_REPLAYS]

def to_freq_vector(counter_obj, actions_kept):
    """Return, for each action in ``actions_kept``, its share of all counts in ``counter_obj``.

    Actions missing from ``counter_obj`` get a share of 0. When ``counter_obj``
    sums to zero the result is a list of integer zeros of the same length.
    """
    denom = sum(counter_obj.values())
    if denom != 0:
        return [counter_obj.get(name, 0) / denom for name in actions_kept]
    return [0] * len(actions_kept)

# One frequency vector per replay, with columns in the order of filtered_actions.
X_list = [to_freq_vector(counter, filtered_actions) for counter in df["action_counter"]]
X_bag = pd.DataFrame(X_list, columns=filtered_actions)


In [None]:
def _distinct_ratio(row):
    """Return distinct_actions / total_actions for one row, or 0 when total_actions is not positive."""
    n_total = row["total_actions"]
    if n_total > 0:
        return row["distinct_actions"] / n_total
    return 0


# Size-based features derived from each replay's action tally.
action_counters = df["action_counter"]
df["total_actions"] = action_counters.map(lambda cnt: sum(cnt.values()))
df["distinct_actions"] = action_counters.map(len)
df["ratio_distinct"] = df.apply(_distinct_ratio, axis=1)

def entropy_actions(c):
    """Return the base-2 Shannon entropy of the count distribution in ``c`` (0 if the counts sum to zero)."""
    n_total = sum(c.values())
    if n_total == 0:
        return 0
    entropy = 0
    for count in c.values():
        share = count / n_total
        entropy = entropy - share * np.log2(share)
    return entropy
# Entropy of each replay's action distribution.
df["entropy"] = df["action_counter"].map(entropy_actions)

def compute_apm(row):
    """Return ``row["total_actions"]`` divided by ``row["last_time"] / 60``.

    This is an actions-per-minute rate if last_time is in seconds (TODO confirm
    the unit). Returns 0 when last_time is not positive.
    """
    elapsed = row["last_time"]
    if elapsed > 0:
        return row["total_actions"] / (elapsed / 60)
    return 0
df["apm"] = df.apply(compute_apm, axis=1)

# Scalar per-replay features that sit next to the bag-of-actions frequencies.
misc_columns = ["total_actions","distinct_actions","ratio_distinct","entropy","apm"]
X_misc = df[misc_columns]

# One indicator column per race value, named race_<value>.
race_dummies = pd.get_dummies(df["race"], prefix="race")

# Full design matrix: action frequencies, scalar features, race indicators.
X = pd.concat([X_bag, X_misc, race_dummies], axis=1)

# Encode each player as an integer id, numbered by order of first appearance.
players_unique = df["player"].unique()
player_to_id = dict(zip(players_unique, range(len(players_unique))))
id_to_player = {pid: name for name, pid in player_to_id.items()}
y = df["player"].map(player_to_id)



In [None]:
# 80/20 hold-out split, stratified on the player id, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [None]:

# Hyper-parameter search space for the random forest
# (scipy's randint(low, high) samples integers in [low, high)).
param_dist = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(10, 50),
    "min_samples_leaf": randint(1, 5),
    "max_features": ["sqrt", "log2", 0.3, 0.5]
}

rf_model = RandomForestClassifier(random_state=42)

# 20 sampled configurations, each scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Tune on the training split only, so that (X_val, y_val) stays unseen by the
# search and gives an estimate that is not biased by hyper-parameter selection.
random_search.fit(X_train, y_train)

print("Meilleurs paramètres trouvés :", random_search.best_params_)
print("Meilleur score CV :", random_search.best_score_)

# Accuracy of the tuned model on the held-out validation split.
val_accuracy = accuracy_score(y_val, random_search.predict(X_val))
print("Accuracy validation :", val_accuracy)

# Final model: the winning configuration refitted on every labelled replay,
# which is the data the saved model was trained on before this change.
best_rf = RandomForestClassifier(random_state=42, **random_search.best_params_)
best_rf.fit(X, y)


Fitting 5 folds for each of 20 candidates, totalling 100 fits




Meilleurs paramètres trouvés : {'max_depth': 44, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 213}
Meilleur score CV : 0.8997316948834214


In [None]:
# Everything the inference step needs to rebuild the training feature layout and
# to translate predicted ids back to player names.
mappings = {
    "filtered_actions": filtered_actions,
    "player_to_id": player_to_id,
    "id_to_player": id_to_player,
    # Stored as a plain list rather than a pandas Index, so that loading
    # mappings.pkl does not depend on the pandas version that wrote it;
    # DataFrame.reindex(columns=...) accepts a list just as well.
    "X_columns": list(X.columns)
}

with open("model_rf.pkl","wb") as f:
    pickle.dump(best_rf, f)

with open("mappings.pkl","wb") as f:
    pickle.dump(mappings, f)

print("Modèle final sauvegardé. Score CV ~", random_search.best_score_)

Modèle final sauvegardé. Score CV ~ 0.8997316948834214


In [None]:
import csv
import re
import numpy as np
import pandas as pd
from collections import Counter
import pickle

def remove_time_markers(seq):
    """Return ``seq`` without its ``t<digits>`` time markers, order preserved."""
    is_marker = lambda token: re.match(r"^t\d+$", token) is not None
    return [token for token in seq if not is_marker(token)]

def get_last_time(seq):
    """Return the highest N found in the ``tN`` markers of ``seq``; 0 when none is present."""
    latest = 0
    for token in seq:
        hit = re.match(r"^t(\d+)$", token)
        if not hit:
            continue
        latest = max(latest, int(hit.group(1)))
    return latest

def to_freq_vector(counter_obj, actions_kept):
    """Map each action in ``actions_kept`` to its fraction of the total count in ``counter_obj``.

    Unknown actions contribute 0. If the counts sum to zero, a list of integer
    zeros with one entry per kept action is returned.
    """
    n_total = sum(counter_obj.values())
    if n_total == 0:
        return [0 for _ in actions_kept]
    return [counter_obj.get(action, 0) / n_total for action in actions_kept]

def entropy_actions(c):
    """Return the base-2 Shannon entropy of the counts in ``c``; 0 when they sum to zero."""
    n_total = sum(c.values())
    if n_total == 0:
        return 0
    entropy = 0
    for key in c:
        share = c[key] / n_total
        entropy = entropy - share * np.log2(share)
    return entropy

def compute_apm(total_actions, last_time):
    """Return ``total_actions`` divided by ``last_time / 60``.

    This is an actions-per-minute rate if last_time is in seconds (TODO confirm
    the unit). Returns 0 when last_time is not positive.
    """
    if last_time > 0:
        return total_actions / (last_time / 60)
    return 0


# Reload the trained forest and the feature/label mappings from the files
# written by the training cells.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that this notebook produced itself.
with open("model_rf.pkl","rb") as f:
    best_rf = pickle.load(f)
with open("mappings.pkl","rb") as f:
    saved = pickle.load(f)

filtered_actions = saved["filtered_actions"]
id_to_player = saved["id_to_player"]
X_columns = saved["X_columns"]

file_test = '/content/drive/MyDrive/TEST.CSV'
races_test, actions_test = [], []
# Test rows carry no player column: the race comes first, then the action sequence.
# newline="" is the mode the csv module documents for file objects given to csv.reader.
with open(file_test,"r",encoding="utf-8",newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip completely empty rows.
        if len(row)<1:
            continue
        race = row[0]
        actions = row[1:]
        races_test.append(race)
        actions_test.append(actions)

df_test = pd.DataFrame({"race":races_test, "actions":actions_test})

# Same per-replay preprocessing as for the training frame.
df_test["actions_clean"] = df_test["actions"].apply(remove_time_markers)
df_test["last_time"] = df_test["actions"].apply(get_last_time)
df_test["action_counter"] = df_test["actions_clean"].apply(Counter)

# Action-frequency features restricted to the actions kept at training time.
X_list_test = []
for c in df_test["action_counter"]:
    X_list_test.append(to_freq_vector(c, filtered_actions))
X_test_bag = pd.DataFrame(X_list_test, columns=filtered_actions)

df_test["total_actions"] = df_test["action_counter"].apply(lambda c: sum(c.values()))
df_test["distinct_actions"] = df_test["action_counter"].apply(lambda c: len(c))
df_test["ratio_distinct"] = df_test.apply(
    lambda row: row["distinct_actions"]/row["total_actions"] if row["total_actions"]>0 else 0, axis=1)
df_test["entropy"] = df_test["action_counter"].apply(entropy_actions)
df_test["apm"] = df_test.apply(lambda row: compute_apm(row["total_actions"], row["last_time"]), axis=1)

X_test_misc = df_test[["total_actions","distinct_actions","ratio_distinct","entropy","apm"]]
race_dummies_test = pd.get_dummies(df_test["race"], prefix="race")

X_test_full = pd.concat([X_test_bag, X_test_misc, race_dummies_test], axis=1)
# Align with the training layout: same columns in the same order; columns absent
# from the test data are added filled with 0, and columns not seen in training are dropped.
X_test_full = X_test_full.reindex(columns=X_columns, fill_value=0)

# Predict integer ids, then translate them back to player names.
pred_ids = best_rf.predict(X_test_full)
pred_players = [id_to_player[i] for i in pred_ids]

# RowId is 1-based and follows the order of the rows kept from the test file.
submission = pd.DataFrame({
    "RowId": range(1, len(pred_players)+1),
    "prediction": pred_players
})
submission.to_csv("submission8_1.csv", index=False)
print("Submission ready:", len(submission), "lines.")


Submission ready: 340 lines.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

path = '/content/drive/MyDrive/machine_learning_pro/instarcraft/TRAIN.CSV'

# Load only the first two fields of every row (player and race).
data2 = pd.read_csv(path, header=None, usecols=[0, 1], engine='python', on_bad_lines='skip')
data2.columns = ['Joueur', 'Race']

# Number of matches per player.
match_counts = data2['Joueur'].value_counts()

# Players with fewer matches than this threshold are treated as rare.
seuil_min_matchs = 5

# Partition the players, then the rows, into rare and regular groups.
joueurs_rares = match_counts[match_counts < seuil_min_matchs].index
joueurs_normaux = match_counts[match_counts >= seuil_min_matchs].index

data_rares = data2[data2['Joueur'].isin(joueurs_rares)]
data_normaux = data2[data2['Joueur'].isin(joueurs_normaux)]

# Show the players that have fewer than `seuil_min_matchs` matches.
print("\nJoueurs ayant moins de", seuil_min_matchs, "matchs :")
print(data_rares.groupby("Joueur").size().reset_index(name="Nombre de matchs"))

# Regular players: 80/20 split stratified on the race.
# NOTE(review): stratifying on 'Race' does not guarantee that every regular
# player appears in both sets; stratify on 'Joueur' if that is the goal.
train_normaux, val_normaux = train_test_split(data_normaux, test_size=0.2, random_state=42, stratify=data_normaux['Race'])

# Rare players: the split must spread each player's matches over train and
# validation. A plain random 50/50 split can put all of a player's matches on
# one side, so stratify on the player. Stratification needs at least two rows
# per class, so players with a single match are kept in train only.
nb_matchs_rares = data_rares['Joueur'].map(match_counts)
rares_divisibles = data_rares[nb_matchs_rares >= 2]
rares_uniques = data_rares[nb_matchs_rares < 2]

if rares_divisibles.empty:
    # Nothing to split: both parts are empty frames with the right columns.
    train_rares, val_rares = rares_divisibles, rares_divisibles
else:
    train_rares, val_rares = train_test_split(
        rares_divisibles, test_size=0.5, random_state=42,
        stratify=rares_divisibles['Joueur'])

if not rares_uniques.empty:
    train_rares = pd.concat([train_rares, rares_uniques])

# Combine both groups into the final splits.
train_final = pd.concat([train_normaux, train_rares])
val_final = pd.concat([val_normaux, val_rares])

# Sanity check: how many distinct players each split contains.
print("\nNombre de joueurs uniques dans Train :", train_final['Joueur'].nunique())
print("Nombre de joueurs uniques dans Validation :", val_final['Joueur'].nunique())



Joueurs ayant moins de 5 matchs :
                                               Joueur  Nombre de matchs
0   http://eu.battle.net/sc2/en/profile/3434150/1/...                 4
1   http://eu.battle.net/sc2/en/profile/4860568/1/...                 4
2   http://kr.battle.net/sc2/en/profile/2340350/1/...                 4
3   http://kr.battle.net/sc2/en/profile/2342299/1/...                 4
4   http://kr.battle.net/sc2/en/profile/2342789/1/...                 4
5   http://kr.battle.net/sc2/en/profile/2343183/1/...                 4
6   http://kr.battle.net/sc2/en/profile/2343910/1/...                 4
7   http://kr.battle.net/sc2/en/profile/2344602/1/...                 4
8   http://kr.battle.net/sc2/en/profile/2344728/1/...                 4
9   http://kr.battle.net/sc2/en/profile/2345102/1/...                 4
10  http://kr.battle.net/sc2/en/profile/3488858/1/...                 4
11  http://kr.battle.net/sc2/en/profile/4324250/1/...                 4
12  http://kr.battle.net/sc2/