# Library Importation

In [59]:
import pandas as pd 
import numpy as np 
import plotly.express as px 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm

# Data Importation and Exploration

In [4]:
## import data into pandas dataframe 
df = pd.read_csv('/kaggle/input/student-stress-factors/Student Stress Factors.csv')

##checking the columns of the data 
columns = df.columns
columns

                

Index(['Timestamp', 'Kindly Rate your Sleep Quality 😴',
       'How many times a week do you suffer headaches 🤕?',
       'How would you rate you academic performance 👩‍🎓?',
       'how would you rate your study load?',
       'How many times a week you practice extracurricular activities 🎾?',
       'How would you rate your stress levels?'],
      dtype='object')

In [5]:
## checking the structure and data types of the datframe 
data_structure = df.info()
data_structure

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 7 columns):
 #   Column                                                            Non-Null Count  Dtype 
---  ------                                                            --------------  ----- 
 0   Timestamp                                                         53 non-null     object
 1   Kindly Rate your Sleep Quality 😴                                  53 non-null     int64 
 2   How many times a week do you suffer headaches 🤕?                  53 non-null     int64 
 3   How would you rate you academic performance 👩‍🎓?                  53 non-null     int64 
 4   how would you rate your study load?                               53 non-null     int64 
 5   How many times a week you practice extracurricular activities 🎾?  53 non-null     int64 
 6   How would you rate your stress levels?                            53 non-null     int64 
dtypes: int64(6), object(1)
memory usage: 3.0+ KB


In [6]:
## checking the summary of the dataframe
summary = df.describe()
summary

Unnamed: 0,Kindly Rate your Sleep Quality 😴,How many times a week do you suffer headaches 🤕?,How would you rate you academic performance 👩‍🎓?,how would you rate your study load?,How many times a week you practice extracurricular activities 🎾?,How would you rate your stress levels?
count,53.0,53.0,53.0,53.0,53.0,53.0
mean,3.150943,1.981132,3.226415,2.811321,2.886792,2.792453
std,1.199298,1.263246,1.154281,1.428509,1.449939,1.377826
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,3.0,2.0,2.0,2.0
50%,3.0,1.0,3.0,2.0,3.0,3.0
75%,4.0,3.0,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0,5.0,5.0


In [7]:
## checking the first five(5) rows of the dataframe
first_5_rows = df.head(5)
first_5_rows

Unnamed: 0,Timestamp,Kindly Rate your Sleep Quality 😴,How many times a week do you suffer headaches 🤕?,How would you rate you academic performance 👩‍🎓?,how would you rate your study load?,How many times a week you practice extracurricular activities 🎾?,How would you rate your stress levels?
0,27/10/2023 21:54:15,3,1,3,4,2,3
1,28/10/2023 12:24:40,4,1,2,3,3,2
2,28/10/2023 12:24:51,2,1,2,1,4,4
3,28/10/2023 12:26:11,3,2,3,2,3,3
4,28/10/2023 12:26:45,2,3,1,5,5,3


# Data Cleaning and Validation

In [8]:
## checking for null values 
null_values = df.isnull().sum()
null_values

Timestamp                                                           0
Kindly Rate your Sleep Quality 😴                                    0
How many times a week do you suffer headaches 🤕?                    0
How would you rate you academic performance 👩‍🎓?                    0
how would you rate your study load?                                 0
How many times a week you practice extracurricular activities 🎾?    0
How would you rate your stress levels?                              0
dtype: int64

In [9]:
## checking for duplicated rows 
duplicated_rows = df.duplicated().sum()
duplicated_rows

0

# Data Exploration Analysis 

In [97]:
df

Unnamed: 0,Timestamp,Kindly Rate your Sleep Quality 😴,How many times a week do you suffer headaches 🤕?,How would you rate you academic performance 👩‍🎓?,how would you rate your study load?,How many times a week you practice extracurricular activities 🎾?,How would you rate your stress levels?,Performance Prediction
0,27/10/2023 21:54:15,3,1,3,4,2,3,3
1,28/10/2023 12:24:40,4,1,2,3,3,2,3
2,28/10/2023 12:24:51,2,1,2,1,4,4,2
3,28/10/2023 12:26:11,3,2,3,2,3,3,3
4,28/10/2023 12:26:45,2,3,1,5,5,3,2
5,28/10/2023 12:31:02,3,1,3,2,1,1,3
6,28/10/2023 12:34:45,3,5,1,4,3,5,2
7,28/10/2023 12:35:43,4,3,1,4,1,1,3
8,28/10/2023 12:36:07,2,1,4,4,5,1,3
9,28/10/2023 12:36:20,1,2,3,2,5,2,2


## Histogram for Sleep Quality Rating"

In [10]:
fig = px.histogram(df, x='Kindly Rate your Sleep Quality 😴')
fig.update_layout(
    title="Sleep Quality Rating",
    xaxis_title="Sleep Quality",
    yaxis_title="Frequency"
)

fig.show()

## Histogram for Days with Headaches 

In [11]:
fig = px.histogram(df, x='How many times a week do you suffer headaches 🤕?')
fig.update_layout(
    title="Days with Headaches",
    xaxis_title="Days with Headaches",
    yaxis_title="Frequency"
)

fig.show()

## Histogram for Academic Rating

In [12]:
fig = px.histogram(df, x='How would you rate you academic performance 👩‍🎓?')
fig.update_layout(
    title="Academic Perfomance Rating",
    xaxis_title="Academic Rating",
    yaxis_title="Frequency"
)

fig.show()


## Histogram for Study Load Ratings 

In [13]:
fig = px.histogram(df, x='how would you rate your study load?')
fig.update_layout(
    title="Study Load Rating",
    xaxis_title="Study Load Rating",
    yaxis_title="Frequency"
)

fig.show()


## Histogram for Extracurrilcular Activity

In [14]:
fig = px.histogram(df, x='How many times a week you practice extracurricular activities 🎾?')
fig.update_layout(
    title="Extracurrilcular Activity Rating",
    xaxis_title="Extracurrilcular Activity Rating",
    yaxis_title="Frequency"
)

fig.show()

## Histogram for Stress Level Rating 

In [15]:
fig = px.histogram(df, x='How would you rate your stress levels?')
fig.update_layout(
    title="Stress Level Rating",
    xaxis_title="Stress Level Rating",
    yaxis_title="Frequency"
)

fig.show()

# Correlation Analysis 

In [16]:
df.columns

Index(['Timestamp', 'Kindly Rate your Sleep Quality 😴',
       'How many times a week do you suffer headaches 🤕?',
       'How would you rate you academic performance 👩‍🎓?',
       'how would you rate your study load?',
       'How many times a week you practice extracurricular activities 🎾?',
       'How would you rate your stress levels?'],
      dtype='object')

In [17]:
correlation = df[['Kindly Rate your Sleep Quality 😴',
       'How many times a week do you suffer headaches 🤕?',
       'how would you rate your study load?',
        'How would you rate you academic performance 👩‍🎓?',          
       'How many times a week you practice extracurricular activities 🎾?',
       'How would you rate your stress levels?']].corr()
correlation

Unnamed: 0,Kindly Rate your Sleep Quality 😴,How many times a week do you suffer headaches 🤕?,how would you rate your study load?,How would you rate you academic performance 👩‍🎓?,How many times a week you practice extracurricular activities 🎾?,How would you rate your stress levels?
Kindly Rate your Sleep Quality 😴,1.0,0.027303,0.095518,0.266565,-0.001043,0.286995
How many times a week do you suffer headaches 🤕?,0.027303,1.0,0.104557,-0.115711,-0.179676,-0.035439
how would you rate your study load?,0.095518,0.104557,1.0,0.073057,0.054481,0.341232
How would you rate you academic performance 👩‍🎓?,0.266565,-0.115711,0.073057,1.0,0.050081,0.005932
How many times a week you practice extracurricular activities 🎾?,-0.001043,-0.179676,0.054481,0.050081,1.0,0.180536
How would you rate your stress levels?,0.286995,-0.035439,0.341232,0.005932,0.180536,1.0


# Linear Regression Analysis 


## Splitting the data into X and Y 

In [18]:
df.columns

Index(['Timestamp', 'Kindly Rate your Sleep Quality 😴',
       'How many times a week do you suffer headaches 🤕?',
       'How would you rate you academic performance 👩‍🎓?',
       'how would you rate your study load?',
       'How many times a week you practice extracurricular activities 🎾?',
       'How would you rate your stress levels?'],
      dtype='object')

In [30]:
x = df[['Kindly Rate your Sleep Quality 😴',
       'How many times a week do you suffer headaches 🤕?',
       'how would you rate your study load?',
       'How many times a week you practice extracurricular activities 🎾?',
       'How would you rate your stress levels?']].to_numpy()

In [29]:
y = df['How would you rate you academic performance 👩‍🎓?'].to_numpy().reshape(-1,1)

## Training the model 

In [31]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 10)

In [75]:
model= LinearRegression()

## Fitting the Varables into the Model 

In [76]:
model.fit(x_train,y_train)

## Linear Regression Coefficient 

In [77]:
linear_coefficient = model.coef_
linear_coefficient

array([[ 0.19577561, -0.11549465,  0.02763273, -0.0420948 , -0.10584436]])

## Academic Performance Prediction 

In [78]:
performance_prediction = model.predict(x).astype(int)


## Coefficient Determination (r2)

In [79]:
coefficient_determination = r2_score(y,performance_prediction)
coefficient_determination

-0.19798474945533795

R-squared value of -0.19798474945533795 means that the model's predictions are not explaining much of the variance in academic performance and, in fact, may be performing poorly.

# Ordinary Least Square Model 

In [80]:
# Add a constant (intercept) to the independent variables
X = df[['Kindly Rate your Sleep Quality 😴',
       'How many times a week do you suffer headaches 🤕?',
       'how would you rate your study load?',
       'How many times a week you practice extracurricular activities 🎾?',
       'How would you rate your stress levels?']]

X = sm.add_constant(X)


In [81]:
# Create an OLS model
model = sm.OLS(y, X)

In [82]:
# Fit the model to your data
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.103
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1.078
Date:,"Sun, 05 Nov 2023",Prob (F-statistic):,0.385
Time:,06:06:18,Log-Likelihood:,-79.425
No. Observations:,53,AIC:,170.9
Df Residuals:,47,BIC:,182.7
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.5306,0.671,3.770,0.000,1.180,3.881
Kindly Rate your Sleep Quality 😴,0.2856,0.139,2.054,0.046,0.006,0.565
How many times a week do you suffer headaches 🤕?,-0.1194,0.129,-0.923,0.361,-0.380,0.141
how would you rate your study load?,0.0798,0.120,0.667,0.508,-0.161,0.321
How many times a week you practice extracurricular activities 🎾?,0.0352,0.114,0.309,0.758,-0.194,0.264
How would you rate your stress levels?,-0.1052,0.130,-0.807,0.423,-0.367,0.157

0,1,2,3
Omnibus:,0.873,Durbin-Watson:,1.692
Prob(Omnibus):,0.646,Jarque-Bera (JB):,0.828
Skew:,-0.287,Prob(JB):,0.661
Kurtosis:,2.788,Cond. No.,27.7


 ## Key points to note include:

R-squared (R^2): This is a measure of how well the model explains the variance in the dependent variable. In this case, the R-squared is 0.103, indicating that the independent variables explain approximately 10.3% of the variance in the dependent variable.


F-statistic and Prob (F-statistic): The F-statistic tests the overall significance of the model. In this case, the F-statistic is 1.078 with a probability of 0.385, which suggests that the model may not be statistically significant.


Coefficients (coef): These values represent the estimated coefficients for each independent variable in the model. The standard errors, t-statistics, and p-values are also provided to assess the statistical significance of each coefficient.


AIC and BIC: These are information criteria used for model selection. Lower values indicate a better-fitting model.


Omnibus, Durbin-Watson, Jarque-Bera, Skew, and Kurtosis: These are various statistical tests and measures related to the model's assumptions and fit. For example, the Durbin-Watson value checks for autocorrelation in the residuals.


Cond. No.: This represents the condition number, which can indicate potential multicollinearity (high correlation between independent variables).

In [96]:
df['Performance Prediction'] = performance_prediction
df

Unnamed: 0,Timestamp,Kindly Rate your Sleep Quality 😴,How many times a week do you suffer headaches 🤕?,How would you rate you academic performance 👩‍🎓?,how would you rate your study load?,How many times a week you practice extracurricular activities 🎾?,How would you rate your stress levels?,Performance Prediction
0,27/10/2023 21:54:15,3,1,3,4,2,3,3
1,28/10/2023 12:24:40,4,1,2,3,3,2,3
2,28/10/2023 12:24:51,2,1,2,1,4,4,2
3,28/10/2023 12:26:11,3,2,3,2,3,3,3
4,28/10/2023 12:26:45,2,3,1,5,5,3,2
5,28/10/2023 12:31:02,3,1,3,2,1,1,3
6,28/10/2023 12:34:45,3,5,1,4,3,5,2
7,28/10/2023 12:35:43,4,3,1,4,1,1,3
8,28/10/2023 12:36:07,2,1,4,4,5,1,3
9,28/10/2023 12:36:20,1,2,3,2,5,2,2


### Please upvote if this was useful 