In [None]:
# DATA 622 - Assignment

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Raw CSV locations for the two assignment datasets (hosted on GitHub).
dataset1_url = "https://raw.githubusercontent.com/hbedros/data622-assignment1/refs/heads/main/data/dataset-1.csv"
dataset2_url = "https://raw.githubusercontent.com/hbedros/data622-assignment1/refs/heads/main/data/dataset-2.csv"

# Download and parse each CSV into its own DataFrame.
dataset1, dataset2 = (pd.read_csv(url) for url in (dataset1_url, dataset2_url))

# Plain-text preview of the first five rows of each dataset.
for preview in (dataset1.head(), dataset2.head()):
    print(preview)


   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0             23          84                  Low                High   
1             19          64                  Low              Medium   
2             24          98               Medium              Medium   
3             29          89                  Low              Medium   
4             19          92               Medium              Medium   

  Extracurricular_Activities  Sleep_Hours  Previous_Scores Motivation_Level  \
0                         No            7               73              Low   
1                         No            8               59              Low   
2                        Yes            7               91           Medium   
3                        Yes            8               98           Medium   
4                        Yes            6               65           Medium   

  Internet_Access  Tutoring_Sessions Family_Income Teacher_Quality  \
0             Ye

### Overview of the Datasets

We have two datasets to work with:

1. **Dataset 1 (`dataset-1.csv`)**:
   - This dataset has many columns describing students, such as hours studied, attendance, and sleep hours.
   - It also includes some categories, like whether they had access to resources or if they were involved in extracurricular activities.
   - Overall, it gives a good picture of their backgrounds and behaviors.

2. **Dataset 2 (`dataset-2.csv`)**:
   - This dataset focuses on the actual scores: Math, Reading, Writing, and a Placement Score.
   - It’s more about the outcomes of their studies.

### Similarities and Differences

**Similarities**:
- Both datasets describe students and their academic performance.
- Both relate to how students perform in school, which is directly relevant to the analysis.

**Differences**:
- Dataset 1 has a lot more details about students' experiences and environments.
- Dataset 2 is mainly about their scores, so it’s smaller and focused just on performance.

### Analyzing the Data

For the analysis, we can aim to predict how well students will do (like their scores) based on the info in Dataset 1. 

### Which Algorithms to Use

Here are a couple of machine learning algorithms that could work well:

1. **Linear Regression**:
   - This is good for predicting scores since it looks at relationships between things (like how study hours might affect scores).

2. **Random Forest Regression**:
   - This one’s a bit more complex and can handle a mix of numbers and categories. It's great if we think there might be non-linear relationships between the factors.

### Exploratory Analysis Report

## Matt's Role - Large Dataset

In [None]:
# Count how many rows fall under each Club_Join_Date value and present the
# result as a two-column table: the join date and its frequency.
club_join_freq = (
    dataset2['Club_Join_Date']
    .value_counts()
    .rename_axis('Join Date')
    .reset_index(name='Frequency')
)
display(club_join_freq)


The number of individuals who joined the club is fairly evenly distributed across the years, with the highest number of enrollments in 2019.

In [None]:
# Order the rows by join date (rebinds dataset2 to the sorted frame).
dataset2 = dataset2.sort_values(by=['Club_Join_Date'])

# Mean of every other column per join date. The aggregation yields
# (column, 'mean') column tuples, which are flattened to '<column>_mean'.
Means = dataset2.groupby(['Club_Join_Date']).agg(['mean'])
Means.columns = ['_'.join(parts).strip() for parts in Means.columns]
Means = Means.reset_index()

display(Means)

fig, ax = plt.subplots(figsize=(15, 10))

# One line per averaged score column, labelled for the legend.
for column, label in [
    ('Math_Score_mean', "Mean Math Score"),
    ('Writing_Score_mean', "Mean Writing Score"),
    ('Placement_Score_mean', "Mean Placement Score"),
]:
    ax.plot(Means['Club_Join_Date'], Means[column], label=label)

# Put a tick on every join date present in the data.
ax.set_xticks(Means['Club_Join_Date'])

ax.set_xlabel('Time in Years')
ax.set_ylabel('Mean Scores')
ax.set_title('Changes in Mean Scores Over Time')
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1, 1), loc='center left')
fig.tight_layout()

plt.show()

We can see that average scores are fairly consistent across the years, fluctuating between 79 and 81. Interestingly, writing and placement scores appear to decrease quite a bit in 2021.

In [None]:
# Pairwise Pearson correlation between all columns of dataset2.
data2corr = dataset2.corr(method='pearson')

print(data2corr)

# Annotated heatmap of the correlation matrix. A diverging palette keeps
# positive and negative coefficients visually distinct.
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data2corr, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title('Pearson Correlation Between Dataset 2 Variables')
plt.show()  # render the figure instead of echoing the Axes repr

The variables present are not particularly correlated with each other, interestingly enough. The strongest relationships seen are between math score and writing score (r = 0.13), as well as club join date and reading score (r = -0.08). However, these relationships are rather weak and suggest a high degree of variance across scores and across years.

In [None]:
# Keep only the rows whose Club_Join_Date equals 2018.
data_2018 = dataset2[dataset2['Club_Join_Date'] == 2018]

# Math vs. reading scores for that subset.
plt.scatter(data_2018['Math_Score'], data_2018['Reading_Score'])
plt.xlabel('Math Score')
plt.ylabel('Reading Score')
plt.title('Math vs. Reading Scores (Club_Join_Date = 2018)')
plt.show()  # must be called; a bare `plt.show` only echoes the function object

In [None]:
# Score pairs to compare, as (x column, y column).
score_pairs = [
    ('Math_Score', 'Writing_Score'),
    ('Math_Score', 'Placement_Score'),
    ('Writing_Score', 'Placement_Score'),
]

# One grid per pair, faceted into columns by Club_Join_Date.
# FacetGrid.map returns the grid itself, so g1..g3 are the FacetGrid objects.
g1, g2, g3 = (
    sns.FacetGrid(dataset2, col="Club_Join_Date").map(plt.scatter, x_col, y_col)
    for x_col, y_col in score_pairs
)

plt.show()


As we can see, these score pairs show no clear correlation, even within the same join year.