In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

## Предобработка данных
Объединим таблички в одну, удалим повторяющиеся и лишние строки по типу ```url```

In [2]:
appearances = pd.read_csv('football_data/appearances.csv')

In [3]:
appearances.columns

Index(['appearance_id', 'game_id', 'player_id', 'player_club_id',
       'player_current_club_id', 'date', 'player_name', 'competition_id',
       'yellow_cards', 'red_cards', 'goals', 'assists', 'minutes_played'],
      dtype='object')

In [4]:
competitions = pd.read_csv('football_data/competitions.csv')

In [5]:
competitions.columns

Index(['competition_id', 'competition_code', 'name', 'sub_type', 'type',
       'country_id', 'country_name', 'domestic_league_code', 'confederation',
       'url', 'is_major_national_league'],
      dtype='object')

In [6]:
competitions = competitions.drop(columns=['name','country_name','url'])

In [7]:
merged_df = pd.merge(competitions, appearances, on='competition_id',how='left')

merged_df.columns

Index(['competition_id', 'competition_code', 'sub_type', 'type', 'country_id',
       'domestic_league_code', 'confederation', 'is_major_national_league',
       'appearance_id', 'game_id', 'player_id', 'player_club_id',
       'player_current_club_id', 'date', 'player_name', 'yellow_cards',
       'red_cards', 'goals', 'assists', 'minutes_played'],
      dtype='object')

In [8]:
games = pd.read_csv('football_data/games.csv')
player_valuations = pd.read_csv('football_data/player_valuations.csv')
player = pd.read_csv('football_data/players.csv')

In [9]:
games.columns

Index(['game_id', 'competition_id', 'season', 'round', 'date', 'home_club_id',
       'away_club_id', 'home_club_goals', 'away_club_goals',
       'home_club_position', 'away_club_position', 'home_club_manager_name',
       'away_club_manager_name', 'stadium', 'attendance', 'referee', 'url',
       'home_club_formation', 'away_club_formation', 'home_club_name',
       'away_club_name', 'aggregate', 'competition_type'],
      dtype='object')

In [10]:
games = games.drop(columns=['competition_id','date','url','home_club_name','away_club_name'])

In [11]:
merged_df = pd.merge(merged_df, games, on='game_id', how='left')

merged_df.columns

Index(['competition_id', 'competition_code', 'sub_type', 'type', 'country_id',
       'domestic_league_code', 'confederation', 'is_major_national_league',
       'appearance_id', 'game_id', 'player_id', 'player_club_id',
       'player_current_club_id', 'date', 'player_name', 'yellow_cards',
       'red_cards', 'goals', 'assists', 'minutes_played', 'season', 'round',
       'home_club_id', 'away_club_id', 'home_club_goals', 'away_club_goals',
       'home_club_position', 'away_club_position', 'home_club_manager_name',
       'away_club_manager_name', 'stadium', 'attendance', 'referee',
       'home_club_formation', 'away_club_formation', 'aggregate',
       'competition_type'],
      dtype='object')

In [12]:
merged_df.dropna(inplace=True)
merged_df

Unnamed: 0,competition_id,competition_code,sub_type,type,country_id,domestic_league_code,confederation,is_major_national_league,appearance_id,game_id,...,away_club_position,home_club_manager_name,away_club_manager_name,stadium,attendance,referee,home_club_formation,away_club_formation,aggregate,competition_type
30358,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_102043,2329280,...,2.0,Peter Sörensen,Glen Riddersholm,Ceres Park,9364.0,Anders Poulsen,4-1-4-1,4-1-4-1,0:2,domestic_league
30359,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_102045,2329280,...,2.0,Peter Sörensen,Glen Riddersholm,Ceres Park,9364.0,Anders Poulsen,4-1-4-1,4-1-4-1,0:2,domestic_league
30360,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_166460,2329280,...,2.0,Peter Sörensen,Glen Riddersholm,Ceres Park,9364.0,Anders Poulsen,4-1-4-1,4-1-4-1,0:2,domestic_league
30361,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_174001,2329280,...,2.0,Peter Sörensen,Glen Riddersholm,Ceres Park,9364.0,Anders Poulsen,4-1-4-1,4-1-4-1,0:2,domestic_league
30362,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_186557,2329280,...,2.0,Peter Sörensen,Glen Riddersholm,Ceres Park,9364.0,Anders Poulsen,4-1-4-1,4-1-4-1,0:2,domestic_league
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553539,BE1,jupiler-pro-league,first_tier,domestic_league,19,BE1,europa,False,4098850_820474,4098850,...,6.0,Rik De Mil,Wouter Vrancken,Het Kuipje,8000.0,Lawrence Visser,4-2-3-1,4-2-3-1,1:1,domestic_league
1553540,BE1,jupiler-pro-league,first_tier,domestic_league,19,BE1,europa,False,4098850_840149,4098850,...,6.0,Rik De Mil,Wouter Vrancken,Het Kuipje,8000.0,Lawrence Visser,4-2-3-1,4-2-3-1,1:1,domestic_league
1553541,BE1,jupiler-pro-league,first_tier,domestic_league,19,BE1,europa,False,4098850_855731,4098850,...,6.0,Rik De Mil,Wouter Vrancken,Het Kuipje,8000.0,Lawrence Visser,4-2-3-1,4-2-3-1,1:1,domestic_league
1553542,BE1,jupiler-pro-league,first_tier,domestic_league,19,BE1,europa,False,4098850_909737,4098850,...,6.0,Rik De Mil,Wouter Vrancken,Het Kuipje,8000.0,Lawrence Visser,4-2-3-1,4-2-3-1,1:1,domestic_league


In [13]:
merged_df = merged_df.head(200000) # иначе данных будет неразумное число, кода будем присоединять игроков

In [14]:
players = pd.merge(player_valuations, player, on='player_id')

players

Unnamed: 0,player_id,date,market_value_in_eur_x,current_club_id_x,player_club_domestic_competition_id,first_name,last_name,name,last_season,current_club_id_y,...,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur_y,highest_market_value_in_eur
0,405973,2000-01-20,150000,3057,BE1,Fadel,Gobitaka,Fadel Gobitaka,2017,3057,...,,181.0,2023-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/fadel-gobitaka...,BE1,Royal Standard Club de Liège,50000.0,250000.0
1,405973,2016-02-07,250000,3057,BE1,Fadel,Gobitaka,Fadel Gobitaka,2017,3057,...,,181.0,2023-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/fadel-gobitaka...,BE1,Royal Standard Club de Liège,50000.0,250000.0
2,405973,2016-07-15,250000,3057,BE1,Fadel,Gobitaka,Fadel Gobitaka,2017,3057,...,,181.0,2023-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/fadel-gobitaka...,BE1,Royal Standard Club de Liège,50000.0,250000.0
3,405973,2017-01-01,150000,3057,BE1,Fadel,Gobitaka,Fadel Gobitaka,2017,3057,...,,181.0,2023-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/fadel-gobitaka...,BE1,Royal Standard Club de Liège,50000.0,250000.0
4,405973,2018-09-11,75000,3057,BE1,Fadel,Gobitaka,Fadel Gobitaka,2017,3057,...,,181.0,2023-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/fadel-gobitaka...,BE1,Royal Standard Club de Liège,50000.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470506,935554,2024-05-31,50000,114,TR1,Abdülmecid,Dönmez,Abdülmecid Dönmez,2023,114,...,,,2026-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/abdulmecid-don...,TR1,Beşiktaş Jimnastik Kulübü,50000.0,50000.0
470507,954135,2024-05-31,50000,3205,TR1,Berat,Eskin,Berat Eskin,2023,3205,...,right,176.0,2028-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/berat-eskin/pr...,TR1,Kayserispor Kulübü,50000.0,50000.0
470508,1047507,2024-05-31,50000,924,TR1,Emir Mustafa,Vuruşaner,Emir Mustafa Vuruşaner,2023,924,...,left,,,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/emir-mustafa-v...,TR1,İstanbulspor A.Ş.,50000.0,50000.0
470509,1055234,2024-05-31,25000,724,NL1,Mika,van der Horst,Mika van der Horst,2023,724,...,,,2027-06-30 00:00:00,Iconics Management,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/mika-van-der-h...,NL1,Football Club Volendam,25000.0,25000.0


In [15]:
players = players.drop(columns=['url'])

In [16]:
players = players.drop(columns=['image_url'])

In [17]:
players = players.drop(columns=['current_club_name'])

In [18]:
players = players.drop(columns=['date'])

In [19]:
players = players.drop(columns=['first_name','last_name','name'])

In [20]:
players = players.drop(columns=['current_club_id_x'])

In [21]:
players = players.drop(columns=['market_value_in_eur_y','market_value_in_eur_x'])

In [22]:
players = players.rename(columns={'current_club_id_y': 'current_club_id'})

In [23]:
players.columns

Index(['player_id', 'player_club_domestic_competition_id', 'last_season',
       'current_club_id', 'player_code', 'country_of_birth', 'city_of_birth',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'position',
       'foot', 'height_in_cm', 'contract_expiration_date', 'agent_name',
       'current_club_domestic_competition_id', 'highest_market_value_in_eur'],
      dtype='object')

In [24]:
dff = pd.merge(merged_df, players, on='player_id', how='left')
dff.columns

Index(['competition_id', 'competition_code', 'sub_type', 'type', 'country_id',
       'domestic_league_code', 'confederation', 'is_major_national_league',
       'appearance_id', 'game_id', 'player_id', 'player_club_id',
       'player_current_club_id', 'date', 'player_name', 'yellow_cards',
       'red_cards', 'goals', 'assists', 'minutes_played', 'season', 'round',
       'home_club_id', 'away_club_id', 'home_club_goals', 'away_club_goals',
       'home_club_position', 'away_club_position', 'home_club_manager_name',
       'away_club_manager_name', 'stadium', 'attendance', 'referee',
       'home_club_formation', 'away_club_formation', 'aggregate',
       'competition_type', 'player_club_domestic_competition_id',
       'last_season', 'current_club_id', 'player_code', 'country_of_birth',
       'city_of_birth', 'country_of_citizenship', 'date_of_birth',
       'sub_position', 'position', 'foot', 'height_in_cm',
       'contract_expiration_date', 'agent_name',
       'current_club_dom

In [25]:
df = dff.drop_duplicates()

In [26]:
df

Unnamed: 0,competition_id,competition_code,sub_type,type,country_id,domestic_league_code,confederation,is_major_national_league,appearance_id,game_id,...,country_of_citizenship,date_of_birth,sub_position,position,foot,height_in_cm,contract_expiration_date,agent_name,current_club_domestic_competition_id,highest_market_value_in_eur
0,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_102043,2329280,...,Nigeria,1990-09-08,Centre-Forward,Attack,right,174.0,,Elite Consulting,RU1,4500000.0
29,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_102045,2329280,...,Nigeria,1990-04-11,Defensive Midfield,Midfield,right,171.0,2024-06-30 00:00:00,,DK1,1000000.0
51,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_166460,2329280,...,Iraq,1993-07-14,Centre-Back,Defender,right,181.0,2025-05-31 00:00:00,KickGlobal,DK1,400000.0
71,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_174001,2329280,...,Georgia,1993-03-16,Second Striker,Attack,right,187.0,2023-12-31 00:00:00,FA PLAYERS AGENTS,DK1,600000.0
97,DK1,superligaen,first_tier,domestic_league,39,DK1,europa,False,2329280_186557,2329280,...,Denmark,1994-04-04,Centre-Back,Defender,right,196.0,2023-06-30 00:00:00,,DK1,800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5347270,FR1,ligue-1,first_tier,domestic_league,50,FR1,europa,True,2942773_18955,2942773,...,France,1984-09-29,Central Midfield,Midfield,right,181.0,,,FR1,4000000.0
5347302,FR1,ligue-1,first_tier,domestic_league,50,FR1,europa,True,2942773_202318,2942773,...,Algeria,1993-02-25,Attacking Midfield,Midfield,right,180.0,2024-06-30 00:00:00,,FR1,5500000.0
5347329,FR1,ligue-1,first_tier,domestic_league,50,FR1,europa,True,2942773_222859,2942773,...,France,1991-11-19,Attacking Midfield,Midfield,right,175.0,2025-06-30 00:00:00,,FR1,9000000.0
5347356,FR1,ligue-1,first_tier,domestic_league,50,FR1,europa,True,2942773_291200,2942773,...,Senegal,1996-03-08,Centre-Back,Defender,left,190.0,2025-06-30 00:00:00,CAA Stellar,GB1,18000000.0


In [27]:
df['type'].unique()

array(['domestic_league'], dtype=object)

In [28]:
df['foot'].unique()

array(['right', 'left', 'both', nan], dtype=object)

In [29]:
df['confederation'].unique()

array(['europa'], dtype=object)

In [30]:
df['home_club_formation'].unique()

array(['4-1-4-1', '4-2-3-1', '4-4-1-1', '4-3-3', '4-2-4', '4-4-2',
       '3-4-3', '4-4-2 double 6', '5-4-1', '4-3-3 Attacking',
       '4-3-3 Defending', '3-4-3 Diamond', '3-5-2', '3-5-2 flat',
       '4-4-2 Diamond', '5-4-1 Diamond', '3-4-2-1', '4-3-1-2',
       '4-5-1 flat', '3-4-1-2', '4-5-1', '5-3-2', '3-1-4-2', '4-3-2-1',
       '3-3-3-1', '4-1-3-2', '3-5-2 Attacking'], dtype=object)

In [31]:
df['competition_id'].unique()

array(['DK1', 'ES1', 'FR1'], dtype=object)

In [32]:
df['competition_code'].unique()

array(['superligaen', 'laliga', 'ligue-1'], dtype=object)

In [33]:
df['round'].unique()

array(['1. Matchday', '2. Matchday', '3. Matchday', '4. Matchday',
       '5. Matchday', '6. Matchday', '7. Matchday', '8. Matchday',
       '9. Matchday', '10. Matchday', '11. Matchday', '12. Matchday',
       '13. Matchday', '14. Matchday', '15. Matchday', '16. Matchday',
       '17. Matchday', '18. Matchday', '19. Matchday', '20. Matchday',
       '21. Matchday', '22. Matchday', '23. Matchday', '24. Matchday',
       '25. Matchday', '26. Matchday', '27. Matchday', '28. Matchday',
       '29. Matchday', '30. Matchday', '31. Matchday', '32. Matchday',
       '33. Matchday', '34. Matchday', '35. Matchday', '36. Matchday',
       '37. Matchday', '38. Matchday'], dtype=object)

In [34]:
df = df.drop(columns=['stadium'])

In [35]:
df = df.drop(columns=['agent_name'])

In [36]:
df = df.drop(columns=['confederation'])

In [37]:
df = df.drop(columns=['attendance'])

In [38]:
df = df.drop(columns=['is_major_national_league'])

In [39]:
df = df.drop(columns=['yellow_cards','red_cards'])

In [40]:
df = df.drop(columns=['city_of_birth'])

In [41]:
df = df.drop(columns=['appearance_id'])

In [42]:
df = df.drop(columns=['aggregate'])

In [43]:
df = df.drop(columns=['type'])

In [44]:
df = df.drop(columns=['competition_code'])

In [45]:
df = df.drop(columns=['country_of_citizenship'])

***Применим one-hot-encoding к колонкам, где немного уникальных значений***

In [46]:
df = pd.get_dummies(df, columns=['competition_type','foot','sub_type', 'position', 'sub_position', 'competition_id'])

In [47]:
df.columns

Index(['country_id', 'domestic_league_code', 'game_id', 'player_id',
       'player_club_id', 'player_current_club_id', 'date', 'player_name',
       'goals', 'assists', 'minutes_played', 'season', 'round', 'home_club_id',
       'away_club_id', 'home_club_goals', 'away_club_goals',
       'home_club_position', 'away_club_position', 'home_club_manager_name',
       'away_club_manager_name', 'referee', 'home_club_formation',
       'away_club_formation', 'player_club_domestic_competition_id',
       'last_season', 'current_club_id', 'player_code', 'country_of_birth',
       'date_of_birth', 'height_in_cm', 'contract_expiration_date',
       'current_club_domestic_competition_id', 'highest_market_value_in_eur',
       'competition_type_domestic_league', 'foot_both', 'foot_left',
       'foot_right', 'sub_type_first_tier', 'position_Attack',
       'position_Defender', 'position_Goalkeeper', 'position_Midfield',
       'position_Missing', 'sub_position_Attacking Midfield',
       'sub_pos

In [48]:
types = df.dtypes

types[types == 'object']

domestic_league_code                    object
date                                    object
player_name                             object
round                                   object
home_club_manager_name                  object
away_club_manager_name                  object
referee                                 object
home_club_formation                     object
away_club_formation                     object
player_club_domestic_competition_id     object
player_code                             object
country_of_birth                        object
date_of_birth                           object
contract_expiration_date                object
current_club_domestic_competition_id    object
dtype: object

In [49]:
text_columns = ['domestic_league_code',
 'date',
 'player_name',
 'round',
 'home_club_manager_name',
 'away_club_manager_name',
 'referee',
 'home_club_formation',
 'away_club_formation',
 'player_club_domestic_competition_id',
 'player_code',
 'country_of_birth',
 'date_of_birth',
 'contract_expiration_date',
 'current_club_domestic_competition_id']

In [50]:
df[text_columns] = df[text_columns].fillna('')
df

Unnamed: 0,country_id,domestic_league_code,game_id,player_id,player_club_id,player_current_club_id,date,player_name,goals,assists,...,sub_position_Left Midfield,sub_position_Left Winger,sub_position_Left-Back,sub_position_Right Midfield,sub_position_Right Winger,sub_position_Right-Back,sub_position_Second Striker,competition_id_DK1,competition_id_ES1,competition_id_FR1
0,39,DK1,2329280,102043,865,49702,2013-07-19,Sly,0,0,...,False,False,False,False,False,False,False,True,False,False
29,39,DK1,2329280,102045,865,173,2013-07-19,Izunna Uzochukwu,0,0,...,False,False,False,False,False,False,False,True,False,False
51,39,DK1,2329280,166460,678,1063,2013-07-19,Frans Putros,0,0,...,False,False,False,False,False,False,False,True,False,False
71,39,DK1,2329280,174001,678,1177,2013-07-19,Davit Skhirtladze,0,0,...,False,False,False,False,False,False,True,True,False,False
97,39,DK1,2329280,186557,865,5817,2013-07-19,Patrick Banggaard,0,0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5347270,50,FR1,2942773,18955,347,347,2018-01-27,Renaud Cohade,0,0,...,False,False,False,False,False,False,False,False,False,True
5347302,50,FR1,2942773,202318,347,347,2018-01-27,Farid Boulaya,0,0,...,False,False,False,False,False,False,False,False,False,True
5347329,50,FR1,2942773,222859,347,995,2018-01-27,Florent Mollet,0,1,...,False,False,False,False,False,False,False,False,False,True
5347356,50,FR1,2942773,291200,347,703,2018-01-27,Moussa Niakhaté,0,0,...,False,False,False,False,False,False,False,False,False,True


In [51]:
columns_with_nan = df.isna().any()

print("Колонки с пропущенными значениями:")
print(columns_with_nan[columns_with_nan].index)


Колонки с пропущенными значениями:
Index(['last_season', 'current_club_id', 'height_in_cm',
       'highest_market_value_in_eur'],
      dtype='object')


In [52]:
df = df.drop(columns=['home_club_position', 'away_club_position', 'last_season',
       'current_club_id', 'height_in_cm', 'highest_market_value_in_eur'])

In [53]:
columns_to_drop = ['home_club_position', 'away_club_position', 'last_season',
       'current_club_id', 'height_in_cm', 'highest_market_value_in_eur']
for column in columns_to_drop:
    if column in text_columns:
        text_columns.remove(column)

text_columns

['domestic_league_code',
 'date',
 'player_name',
 'round',
 'home_club_manager_name',
 'away_club_manager_name',
 'referee',
 'home_club_formation',
 'away_club_formation',
 'player_club_domestic_competition_id',
 'player_code',
 'country_of_birth',
 'date_of_birth',
 'contract_expiration_date',
 'current_club_domestic_competition_id']

In [54]:
import re

In [55]:
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub('[().,?!-:]', '', text)
    text = text.replace('"', '')
    return text

In [56]:
df[text_columns] = df[text_columns].applymap(preprocess_text)

In [57]:
df.dropna(inplace=True)

In [58]:
cols = df.columns
mas = list(set(cols)-set(text_columns))

In [59]:
df['combined_text'] = df[text_columns].apply(lambda row: ' '.join(row), axis=1)

In [60]:
cols = text_columns
cols.append('goals')
cols.append('combined_text')

Обучимся только на числовых признаках

In [61]:
dff = df.head(20000)

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

target_column = 'goals'  

X = dff.drop(columns=cols)
y = dff[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [63]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16000, 38), (4000, 38), (16000,), (4000,))

Подбираем гиперпараметры и смотрим разные модели

In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

models = [
    {
        'name': 'Logistic Regression',
        'estimator': LogisticRegression(),
        'hyperparameters': {
            'C': [0.1, 1, 10, 100],
            'solver': ['liblinear', 'saga']
        }
    },
    {
        'name': 'Random Forest',
        'estimator': RandomForestClassifier(),
        'hyperparameters': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 5, 10, 20]
        }
    }
]

for model in models:
    print("Training")
    grid = GridSearchCV(model['estimator'], model['hyperparameters'], cv=5)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print("лучшие параметры:", grid.best_params_)
    print( grid.best_score_)
    y_pred = best_model.predict(X_test)
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("="*40)


Training
лучшие параметры: {'C': 0.1, 'solver': 'liblinear'}
0.9155
MAE: 0.084
MSE: 0.1105
Training
лучшие параметры: {'max_depth': 10, 'n_estimators': 50}
0.9155000000000001
MAE: 0.08325
MSE: 0.10875


In [70]:
X = dff['combined_text']
y = dff[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bow = TfidfVectorizer()

X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)


In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor())
]

for name, model in models:
    model.fit(X_train_bow, y_train)
    
    y_pred = model.predict(X_test_bow)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Модель: {name}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print()


Модель: Linear Regression
MAE: 0.15508147292549945
MSE: 0.0979695619754165

Модель: Random Forest
MAE: 0.14640182656926407
MSE: 0.1188513301856495



In [74]:
Лучше всего себя показала Random Forest на числовых признаках
MAE: 0.08325
MSE: 0.10875

SyntaxError: invalid syntax (2680732901.py, line 1)