<a href="https://colab.research.google.com/github/leman-cap13/kaggle_datasets_/blob/main/Social_Anxiety_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Open Colab's file-upload prompt; a later cell copies `kaggle.json` from the
# working directory, so that file is expected to be among the uploads.
from google.colab import files
files.upload()

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle
# (presumably where the `kaggle` CLI used below reads its credentials — TODO confirm).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the social-anxiety dataset archive with the kaggle CLI.
!kaggle datasets download natezhang123/social-anxiety-dataset

In [None]:
import zipfile
# Extract next to the archive rather than into the current working directory,
# so the CSV ends up at the absolute /content path it is later read from.
with zipfile.ZipFile('/content/social-anxiety-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
import pandas as pd
# Show every column when a frame is displayed (no column truncation).
pd.options.display.max_columns = None

# Load the extracted CSV and display it.
df = pd.read_csv('/content/enhanced_anxiety_dataset.csv')
df

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# List the column labels (they are referenced verbatim further down).
df.columns

In [None]:
# Per-column dtype.
df.dtypes

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
df_corr = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Bar chart of how often each anxiety level occurs.
df['Anxiety Level (1-10)'].value_counts().plot.bar()

In [None]:
# Pie chart of the anxiety-level distribution; the counts are computed once
# and reused for both the wedge sizes and their labels.
level_counts = df['Anxiety Level (1-10)'].value_counts()
plt.pie(level_counts, labels=level_counts.index)
plt.show()

In [None]:
# Correlate every int64/float64 column with the anxiety target and show the result.
numeric_values=df.select_dtypes(include=['int64','float64'])
target_value=numeric_values.corrwith(df['Anxiety Level (1-10)'])
target_value

In [None]:
# Correlation of each numeric feature with the target (the target's own row is removed).
target_value = (
    numeric_values
    .corr()['Anxiety Level (1-10)']
    .drop('Anxiety Level (1-10)')
)

# Keep the five most positively correlated features and plot them.
highest_corr = target_value.sort_values(ascending=False).head()

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=highest_corr.values, y=highest_corr.index, ax=ax)
plt.show()

In [None]:
# df.drop('Sleep Hours', axis=1,inplace=True)

In [None]:
# Correlation of each numeric column with the target (target row dropped),
# printed in ascending order.
corrs = df.corr(numeric_only=True)['Anxiety Level (1-10)'].drop('Anxiety Level (1-10)')
print(corrs.sort_values())

In [None]:
# Drop features whose absolute correlation with the target is below 0.1.
# `errors='ignore'` keeps this cell re-runnable: on a second run the columns are
# already gone from `df` (while `corrs` still lists them) and a plain drop would
# raise KeyError.
low_corr_features = corrs[corrs.abs() < 0.1].index.tolist()
df = df.drop(columns=low_corr_features, errors='ignore')

In [None]:
# Preview the frame after the low-correlation columns were dropped.
df.head()

In [None]:
# Feature/target correlations in ascending order.
# NOTE: `target_value` was computed from `numeric_values` before the low-correlation
# columns were dropped from `df`, so this listing still includes those columns.
lowest_corr=target_value.sort_values(ascending=True)
lowest_corr

In [None]:
# def remove_outliers(df):
#   for column in df.select_dtypes(include=['int64','float64']):
#     Q1=df[column].quantile(0.25)
#     Q3=df[column].quantile(0.75)
#     IQR=Q3-Q1
#     upper_bound=Q3+1.5*IQR
#     lower_bound=Q1-1.5*IQR
#     df=df[(df[column]>=lower_bound) & (df[column]<=upper_bound)]
#   return df

In [None]:
# df=remove_outliers(df)

In [None]:
# Engineered features, all derived from the original columns only:
#   stress_sleep_ratio    - stress per hour of sleep (1e-3 guards against division by zero)
#   activity_vs_caffeine  - weekly activity hours minus caffeine in units of 100 mg/day
#   healthy_lifestyle     - diet quality plus weekly activity hours
#   stress_hr_interaction - stress level multiplied by heart rate
df = df.assign(
    stress_sleep_ratio=df['Stress Level (1-10)'] / (df['Sleep Hours'] + 1e-3),
    activity_vs_caffeine=df['Physical Activity (hrs/week)'] - (df['Caffeine Intake (mg/day)'] / 100),
    healthy_lifestyle=df['Diet Quality (1-10)'] + df['Physical Activity (hrs/week)'],
    stress_hr_interaction=df['Stress Level (1-10)'] * df['Heart Rate (bpm)'],
)

In [None]:
# Store the target as integers so the classifiers treat it as discrete labels.
df = df.astype({'Anxiety Level (1-10)': 'int'})

In [None]:
# Feature matrix: every column except the target. Target: an independent copy.
X = df.drop(columns='Anxiety Level (1-10)')
y = df['Anxiety Level (1-10)'].copy()

In [None]:
# Histogram of the target with 10 bins.
sns.histplot(y, bins=10)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed; note that no `stratify` argument is passed.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
import numpy as np

In [None]:
# Split the feature columns by dtype: numeric ones are imputed/scaled,
# everything else is treated as categorical.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

In [None]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. `fill_value` is deliberately not passed: SimpleImputer ignores it
# unless strategy='constant', so it would only mislead the reader.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own sub-pipeline; any column in neither
# group is passed through unchanged.
transformer = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
], remainder='passthrough')

# Baseline classifier; class weights are balanced to offset uneven label counts.
estimator = RandomForestClassifier(random_state=42, class_weight='balanced')

full_pipeline = Pipeline([
    ('transformer', transformer),
    ('estimator', estimator)
])

In [None]:
# Fit the preprocessing + random-forest classifier pipeline on the training split.
full_pipeline.fit(X_train,y_train)

In [None]:
# (train, test) values of the pipeline's default `score` metric.
full_pipeline.score(X_train,y_train),  full_pipeline.score(X_test,y_test)

In [None]:
# Display the current state of df (engineered columns included).
df

In [None]:
# Predict test-split labels with the classifier pipeline.
y_pred=full_pipeline.predict(X_test)

In [None]:
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
accuracy_score(y_test, y_pred)

In [None]:
# Ground truth goes first; with the arguments swapped this would compute recall
# instead of precision for any per-class or macro average.
precision_score(y_test, y_pred, average='micro')

In [None]:
# Ground truth goes first; with the arguments swapped this would compute precision
# instead of recall for any per-class or macro average.
recall_score(y_test, y_pred, average='micro')

In [None]:
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
f1_score(y_test, y_pred, average='micro')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor on top of the shared `transformer` preprocessing step.
estimator2 = RandomForestRegressor(random_state=42)

full_pipeline_2 = Pipeline(steps=[
    ('transformer', transformer),
    ('estimator2', estimator2),
])


In [None]:
# Fit the random-forest regressor pipeline on the training split.
full_pipeline_2.fit(X_train,y_train)

In [None]:
# (train, test) values of the regressor pipeline's default `score` metric.
full_pipeline_2.score(X_train,y_train), full_pipeline_2.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on top of the shared `transformer` preprocessing step.
estimator3 = LinearRegression()

full_pipeline_3 = Pipeline(steps=[
    ('transformer', transformer),
    ('estimator3', estimator3),
])

In [None]:
# Fit the linear-regression pipeline on the training split.
full_pipeline_3.fit(X_train,y_train)

In [None]:
# (train, test) values of the linear-regression pipeline's default `score` metric.
full_pipeline_3.score(X_train,y_train), full_pipeline_3.score(X_test, y_test)

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor on top of the shared `transformer` preprocessing step.
estimator4 = XGBRegressor(random_state=42)

full_pipeline_4 = Pipeline(steps=[
    ('transformer', transformer),
    ('estimator4', estimator4),
])

In [None]:
# Fit the XGBoost regressor pipeline on the training split.
full_pipeline_4.fit(X_train,y_train)

In [None]:
# Score the XGBoost pipeline that was just fitted (not the earlier classifier
# pipeline `full_pipeline`): (train, test) values of its default `score` metric.
full_pipeline_4.score(X_train,y_train), full_pipeline_4.score(X_test,y_test)

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost regressor (RMSE objective) that logs progress every 100 iterations.
cat_model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=700,
    depth=5,
    learning_rate=0.05,
    random_state=42,
    verbose=100,
)

# Same preprocessing step as the other pipelines, followed by the CatBoost model.
full_pipeline_cat = Pipeline(steps=[
    ('transformer', transformer),
    ('estimator', cat_model),
])

In [None]:
# Fit the CatBoost regressor pipeline on the training split.
full_pipeline_cat.fit(X_train, y_train)

In [None]:
# Report the pipeline's default `score` metric on the train and test splits.
print("Train score:", full_pipeline_cat.score(X_train, y_train))
print("Test score:", full_pipeline_cat.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Test-split predictions of the CatBoost regressor (this rebinds `y_pred`).
y_pred = full_pipeline_cat.predict(X_test)

# Compute both regression metrics first, then report them.
cat_r2 = r2_score(y_test, y_pred)
cat_mse = mean_squared_error(y_test, y_pred)

print("R2 Score:", cat_r2)
print("MSE:", cat_mse)


In [None]:
# from sklearn.model_selection import GridSearchCV

# params = {
#     'estimator__depth': [ 6, 8,10],
#     'estimator__learning_rate': [0.01, 0.05, 0.1],
#     'estimator__iterations': [300, 500, 700]
# }

# grid = GridSearchCV(full_pipeline_cat, param_grid=params, cv=3)
# grid.fit(X_train, y_train)

# print("Ən yaxşı nəticə:", grid.best_score_)
# print("Ən yaxşı parametrlər:", grid.best_params_)


In [None]:
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Base learners whose predictions the voting ensemble combines.
voting_members = [
    ('cat', CatBoostRegressor(iterations=500, learning_rate=0.05, depth=5, verbose=0, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42)),
]
voting = VotingRegressor(voting_members)

# Shared preprocessing step followed by the ensemble.
final_pipeline = Pipeline(steps=[
    ('transformer', transformer),
    ('model', voting),
])

final_pipeline.fit(X_train, y_train)
print("Test Score (VotingRegressor):", final_pipeline.score(X_test, y_test))


In [None]:
# Training-split score of the voting ensemble, for comparison with the test score.
final_pipeline.score(X_train, y_train)

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Level-0 learners; their predictions feed the RidgeCV meta-learner.
stack_members = [
    ('xgb', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)),
    ('cat', CatBoostRegressor(iterations=500, learning_rate=0.05, depth=5, verbose=0, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=150, random_state=42)),
]
stack = StackingRegressor(estimators=stack_members, final_estimator=RidgeCV())

# Shared preprocessing step followed by the stacked ensemble.
stack_pipeline = Pipeline(steps=[
    ('transformer', transformer),
    ('model', stack),
])

stack_pipeline.fit(X_train, y_train)

print("Stacking Test Score:", stack_pipeline.score(X_test, y_test))


In [None]:
# Training-split score of the stacked ensemble, for comparison with the test score.
stack_pipeline.score(X_train, y_train)

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Column groups: int64/float64 columns are numeric, everything else categorical.
numeric_dtypes = ['int64', 'float64']
num_features = X.select_dtypes(include=numeric_dtypes).columns
cat_features = X.select_dtypes(exclude=numeric_dtypes).columns

# Numeric branch: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# This rebinds the notebook-level `transformer`; unlisted columns are dropped here
# (ColumnTransformer's default remainder).
transformer = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# imblearn pipeline: preprocess, oversample with SMOTE, then fit a random forest.
pipeline = ImbPipeline(steps=[
    ('transformer', transformer),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

pipeline.fit(X_train, y_train)

train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")


In [None]:
import lightgbm as lgb
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np



# Column groups: int64/float64 columns are numeric, everything else categorical.
numeric_dtypes = ['int64', 'float64']
num_features = X.select_dtypes(include=numeric_dtypes).columns
cat_features = X.select_dtypes(exclude=numeric_dtypes).columns

# Numeric branch: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# This rebinds the notebook-level `transformer`.
transformer = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Multiclass LightGBM classifier; the number of classes is the number of
# distinct target values.
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=y.nunique(),
    n_estimators=500,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42,
)

# imblearn pipeline: preprocess, oversample with SMOTE, then fit LightGBM.
pipeline = ImbPipeline(steps=[
    ('transformer', transformer),
    ('smote', SMOTE(random_state=42)),
    ('classifier', lgbm_clf),
])

pipeline.fit(X_train, y_train)

train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train accuracy: {train_score:.4f}")
print(f"Test accuracy: {test_score:.4f}")

# Per-class precision/recall/F1 on the test split (this rebinds `y_pred`).
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Same split parameters as the earlier train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out part of the training data for best-iteration selection. Passing the
# test split as `eval_set` together with use_best_model=True would pick the model
# on the very data it is then evaluated on, inflating the reported test metrics.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Columns handed to CatBoost as categorical features (object/category dtypes).
# This rebinds the notebook-level `cat_features` to a plain list.
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    verbose=100,
    random_seed=42
)

# The validation split, not the test split, drives use_best_model.
cat_model.fit(
    X_fit,
    y_fit,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    use_best_model=True,
)

# The test split is touched only here, for the final evaluation.
y_pred = cat_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Score the CatBoost classifier on the train and test splits.
cat_model.score(X_train,y_train), cat_model.score(X_test,y_test)