In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load dataset
df = pd.read_csv("Top Expensive Leagues.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    object 
 1   League Name                  700 non-null    object 
 2   Country                      700 non-null    object 
 3   Sport                        700 non-null    object 
 4   Revenue (USD)                700 non-null    float64
 5   Average Player Salary (USD)  700 non-null    object 
 6   Top Team                     700 non-null    object 
 7   Total Teams                  700 non-null    int64  
 8   Founded Year                 700 non-null    float64
 9   Viewership                   697 non-null    float64
dtypes: float64(3), int64(1), object(6)
memory usage: 54.8+ KB


In [None]:
df.isnull().sum()

League ID                      0
League Name                    0
Country                        0
Sport                          0
Revenue (USD)                  0
Average Player Salary (USD)    0
Top Team                       0
Total Teams                    0
Founded Year                   0
Viewership                     3
dtype: int64

In [None]:
for col in df.columns:
    if df[col].dtype=='object':
        df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [None]:
cardinality=df.nunique()

In [None]:
cardinality

League ID                      700
League Name                      8
Country                          7
Sport                            4
Revenue (USD)                  699
Average Player Salary (USD)    541
Top Team                        27
Total Teams                     26
Founded Year                   146
Viewership                     691
dtype: int64

In [None]:
for col in df.columns:
    if df[col].dtype == 'object':
        if cardinality[col] <= 5:
            df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)
        else:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    int64  
 1   League Name                  700 non-null    int64  
 2   Country                      700 non-null    int64  
 3   Revenue (USD)                700 non-null    float64
 4   Average Player Salary (USD)  700 non-null    int64  
 5   Top Team                     700 non-null    int64  
 6   Total Teams                  700 non-null    int64  
 7   Founded Year                 700 non-null    float64
 8   Viewership                   700 non-null    float64
 9   Sport_Basketball             700 non-null    int64  
 10  Sport_Cricket                700 non-null    int64  
 11  Sport_Football               700 non-null    int64  
dtypes: float64(3), int64(9)
memory usage: 65.8 KB


In [None]:
num_col=df.select_dtypes(include=['int32','float64']).columns

In [None]:
num_col

Index(['Revenue (USD)', 'Founded Year', 'Viewership'], dtype='object')

In [None]:
scaler=StandardScaler()
df[num_col]=scaler.fit_transform(df[num_col])

In [None]:
df.head()

Unnamed: 0,League ID,League Name,Country,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership,Sport_Basketball,Sport_Cricket,Sport_Football
0,0,6,0,0.461642,74,15,16,-0.512943,1.644673,0,0,1
1,1,7,3,1.298311,418,12,13,0.265422,-0.863742,0,0,0
2,2,1,0,1.686759,179,8,13,0.282529,-0.703038,0,0,0
3,3,2,5,0.322094,146,4,22,0.453598,-0.072378,0,0,0
4,4,7,4,0.918621,439,13,16,0.145674,-1.451224,1,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    int64  
 1   League Name                  700 non-null    int64  
 2   Country                      700 non-null    int64  
 3   Revenue (USD)                700 non-null    float64
 4   Average Player Salary (USD)  700 non-null    int64  
 5   Top Team                     700 non-null    int64  
 6   Total Teams                  700 non-null    int64  
 7   Founded Year                 700 non-null    float64
 8   Viewership                   700 non-null    float64
 9   Sport_Basketball             700 non-null    int64  
 10  Sport_Cricket                700 non-null    int64  
 11  Sport_Football               700 non-null    int64  
dtypes: float64(3), int64(9)
memory usage: 65.8 KB


In [None]:
x=df.drop(columns=['Viewership'])   # inputlarni tanlash

In [None]:
y=df['Viewership']

In [None]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.2,random_state=42)    # Datani qismlarga ajratish
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [None]:
model=LinearRegression()
linear_model=model.fit(x_train,y_train)
y_pred=linear_model.predict(x_test)
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [None]:
mse

0.8851021690248395

In [None]:
r2

0.031564159035777584

In [None]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # 5-Fold CV

# Perform Cross-Validation
cv_scores = cross_val_score(linear_model, x, y, cv=kf, scoring='neg_mean_squared_error')
cv_scores = np.sqrt(-cv_scores)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.95383824 1.07737359 0.98280102 0.97688835 0.96301788 0.94200463
 1.03872007 0.97683812 1.05525275 1.06760891]
Mean CV Score: 1.0034343574807436


In [None]:
print(np.mean(cv_scores)-mse)

0.1183321884559041


VAZIFA #Restart

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("Corporate Stress.csv")

  df = pd.read_csv("Corporate Stress.csv")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ID                                50000 non-null  int64  
 1   Age                               50000 non-null  int64  
 2   Gender                            50000 non-null  object 
 3   Marital_Status                    50000 non-null  object 
 4   Job_Role                          49999 non-null  object 
 5   Experience_Years                  50000 non-null  int64  
 6   Monthly_Salary_INR                50000 non-null  float64
 7   Working_Hours_per_Week            50000 non-null  int64  
 8   Commute_Time_Hours                49999 non-null  float64
 9   Remote_Work                       50000 non-null  bool   
 10  Stress_Level                      49999 non-null  float64
 11  Health_Issues                     37459 non-null  object 
 12  Comp

In [4]:
df.isnull().sum()

ID                                      0
Age                                     0
Gender                                  0
Marital_Status                          0
Job_Role                                1
Experience_Years                        0
Monthly_Salary_INR                      0
Working_Hours_per_Week                  0
Commute_Time_Hours                      1
Remote_Work                             0
Stress_Level                            1
Health_Issues                       12541
Company_Size                            0
Department                              0
Sleep_Hours                             0
Physical_Activity_Hours_per_Week        0
Mental_Health_Leave_Taken               1
Manager_Support_Level                   0
Work_Pressure_Level                     1
Annual_Leaves_Taken                     1
Work_Life_Balance                       2
Family_Support_Level                    0
Job_Satisfaction                        0
Performance_Rating                

In [5]:
df=df.drop(columns=['Health_Issues'])

In [6]:
for col in df.columns:
    if df[col].dtype=='object':
        df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 29 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ID                                50000 non-null  int64  
 1   Age                               50000 non-null  int64  
 2   Gender                            50000 non-null  object 
 3   Marital_Status                    50000 non-null  object 
 4   Job_Role                          50000 non-null  object 
 5   Experience_Years                  50000 non-null  int64  
 6   Monthly_Salary_INR                50000 non-null  float64
 7   Working_Hours_per_Week            50000 non-null  int64  
 8   Commute_Time_Hours                50000 non-null  float64
 9   Remote_Work                       50000 non-null  bool   
 10  Stress_Level                      50000 non-null  float64
 11  Company_Size                      50000 non-null  object 
 12  Depa

In [8]:
cardinality=df.nunique()

In [9]:
cardinality

ID                                  50000
Age                                    48
Gender                                  3
Marital_Status                          4
Job_Role                                6
Experience_Years                       41
Monthly_Salary_INR                  49927
Working_Hours_per_Week                 56
Commute_Time_Hours                    302
Remote_Work                             2
Stress_Level                           12
Company_Size                            3
Department                              6
Sleep_Hours                            51
Physical_Activity_Hours_per_Week      101
Mental_Health_Leave_Taken               6
Manager_Support_Level                  11
Work_Pressure_Level                    12
Annual_Leaves_Taken                    32
Work_Life_Balance                      12
Family_Support_Level                   11
Job_Satisfaction                       11
Performance_Rating                     12
Team_Size                         

In [10]:
#Object tipidagi ustunlarni label encoding qilamiz. Qolgan ustunlarni one-hot encoding qilamiz

for col in df.select_dtypes(include=['object']).columns:
    cardinality=df[col].nunique()
    if cardinality<=5:
        le=LabelEncoder()
        df[col]=le.fit_transform(df[col])
    else:
        df=pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ID                                50000 non-null  int64  
 1   Age                               50000 non-null  int64  
 2   Gender                            50000 non-null  int64  
 3   Marital_Status                    50000 non-null  int64  
 4   Experience_Years                  50000 non-null  int64  
 5   Monthly_Salary_INR                50000 non-null  float64
 6   Working_Hours_per_Week            50000 non-null  int64  
 7   Commute_Time_Hours                50000 non-null  float64
 8   Remote_Work                       50000 non-null  bool   
 9   Stress_Level                      50000 non-null  float64
 10  Company_Size                      50000 non-null  int64  
 11  Sleep_Hours                       50000 non-null  float64
 12  Phys

In [12]:
#Encoding qilingan ustunlardan tashqari bo'lgan 'bool' tipidagi ustunlarni 'int' ga o'tkazamiz

df = df.astype({col: 'int' for col in df.select_dtypes(include=['bool']).columns})

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ID                                50000 non-null  int64  
 1   Age                               50000 non-null  int64  
 2   Gender                            50000 non-null  int64  
 3   Marital_Status                    50000 non-null  int64  
 4   Experience_Years                  50000 non-null  int64  
 5   Monthly_Salary_INR                50000 non-null  float64
 6   Working_Hours_per_Week            50000 non-null  int64  
 7   Commute_Time_Hours                50000 non-null  float64
 8   Remote_Work                       50000 non-null  int64  
 9   Stress_Level                      50000 non-null  float64
 10  Company_Size                      50000 non-null  int64  
 11  Sleep_Hours                       50000 non-null  float64
 12  Phys

In [14]:
num_col=df.select_dtypes(include=['int32','float64']).columns

In [15]:
scaler=StandardScaler()
df[num_col]=scaler.fit_transform(df[num_col])

In [16]:
df.head()

Unnamed: 0,ID,Age,Gender,Marital_Status,Experience_Years,Monthly_Salary_INR,Working_Hours_per_Week,Commute_Time_Hours,Remote_Work,Stress_Level,...,Department_Finance,Department_HR,Department_IT,Department_Marketing,Department_Sales,Mental_Health_Leave_Taken_True,Mental_Health_Leave_Taken_0,Mental_Health_Leave_Taken_1,Mental_Health_Leave_Taken_FALSE,Mental_Health_Leave_Taken_TRUE
0,1,56,2,3,5,-0.142634,44,0.253029,1,-1.586409,...,0,0,0,1,0,0,0,0,1,0
1,2,46,0,2,20,1.065006,54,-1.014654,1,0.632495,...,0,0,1,0,0,0,0,0,0,1
2,3,32,0,2,10,-1.385168,81,-0.323191,0,0.632495,...,0,1,0,0,0,0,0,0,1,0
3,4,60,0,0,26,-1.133158,79,0.610285,0,0.315509,...,0,0,1,0,0,0,0,0,0,1
4,5,25,1,1,29,1.238113,63,-0.05813,1,0.949482,...,0,0,0,0,1,0,0,0,0,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 41 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ID                                50000 non-null  int64  
 1   Age                               50000 non-null  int64  
 2   Gender                            50000 non-null  int64  
 3   Marital_Status                    50000 non-null  int64  
 4   Experience_Years                  50000 non-null  int64  
 5   Monthly_Salary_INR                50000 non-null  float64
 6   Working_Hours_per_Week            50000 non-null  int64  
 7   Commute_Time_Hours                50000 non-null  float64
 8   Remote_Work                       50000 non-null  int64  
 9   Stress_Level                      50000 non-null  float64
 10  Company_Size                      50000 non-null  int64  
 11  Sleep_Hours                       50000 non-null  float64
 12  Phys

In [18]:
x=df.drop(columns=['Stress_Level'])   # inputlarni tanlash

In [19]:
y=df['Stress_Level']

In [20]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.2,random_state=42)    # Datani qismlarga ajratish
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [21]:
model=LinearRegression()
linear_model=model.fit(x_train,y_train)
y_pred=linear_model.predict(x_test)
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [22]:
mse

1.0043242766722882

In [23]:
r2

-4.992456778940557e-05

In [24]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # 5-Fold CV

# Perform Cross-Validation
cv_scores = cross_val_score(linear_model, x, y, cv=kf, scoring='neg_mean_squared_error')
cv_scores = np.sqrt(-cv_scores)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [1.00259591 1.00545399 1.00870831 1.01005806 1.00688452 0.99416263
 0.99126132 1.00272082 0.99110305 0.99168393]
Mean CV Score: 1.0004632525648256


In [25]:
print(np.mean(cv_scores)-mse)

-0.003861024107462585
