In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
def parse_grace_condone(entry:str)->int:
    if not isinstance(entry, str):
        return int(entry)
    
    # Check if the entry contains '*'
    if "*" in entry:
        parts = entry.split("*")
        # Extract the numeric part and multiply by the grace factor
        return int(parts[0]) + int(parts[1])
    # Check if the entry contains '@'
    if "@" in entry:
        parts = entry.split("@")
        # Extract the numeric part and add the condoned value
        return int(parts[0]) + int(parts[1])
    return entry

In [3]:
# Function to handle the entries of absent students
def clean_data(file):
    df = pd.read_csv(file)
    # # Clean up spaces in the Combined name column
    # df['Combined name'] = df['Combined name'].str.strip()  # Remove leading and trailing spaces
    # df['Combined name'] = df['Combined name'].replace('\s+', ' ', regex=True)  # Replace multiple spaces with a single space
    subject_columns  = [col for col in df.columns if 'BIT' in col]
    df = df[['StudentId', 'StudentName'] + subject_columns]
    df[subject_columns] = df[subject_columns].map(parse_grace_condone)
    
    absent_students = df.apply(lambda row: 'AB' in row.values, axis = 1)
    df.loc[absent_students] = df.loc[absent_students].replace('AB', 0)
    copy_case = df.apply(lambda row: 'CC' in row.values, axis = 1)
    df.loc[copy_case] = df.loc[copy_case].replace('CC', 0)

    df.select_dtypes('object').iloc[:, 1:].astype('int', errors='ignore')
    
    return df

In [4]:
def get_basic_info(df):
    print(f'Shape:\n{df.shape}\n\n')
    print(f'Description:\n{df.describe()}\n\n')
    print(f'Information:{df.info()}\n\n')

In [5]:
# Function to map credits based on marks
def calculate_credits(marks):
    return pd.cut(marks, bins=[0, 40, 45, 50, 55, 60, 70, 80, 101], labels=[0, 4, 5, 6, 7, 8, 9, 10], right=False)

# Function to adjust practical marks with credits criteria
def calculate_practical_percentage(marks):
    return marks//0.5

In [6]:
import re

def calculate_sgpa1(sem_data):
    subject_columns = [col for col in sem_data.columns if col.startswith(('INT', 'EXT'))]
    subject_columns = sorted(subject_columns)
    credits_df = pd.DataFrame()
    credits_df['StudentId'] = sem_data.StudentId
    credits_df['StudentName'] = sem_data.StudentName
    
    for subject_column in subject_columns:
        subject_name = subject_column.split('_')[1]
        total_marks = sem_data[subject_column].astype('int') + sem_data[subject_column.replace('EXT', 'INT')].astype('int')
        
        if re.match(r'BIT(\d{1})P(\d{1})', subject_name):
            # Apply the 'calculate_practical_percentage' function to the total marks
            total_marks = calculate_practical_percentage(total_marks)
            
        subject_columns.remove(subject_column.replace('EXT', 'INT'))
        
        # Convert percentage marks to GPA based on criteria
        gpa = calculate_credits(total_marks)

        # Add the calculated GPA to the credits DataFrame
        credits_df[subject_name] = gpa
    
    # Convert each column to numeric in the specified range
    numeric_columns = credits_df.iloc[:, 2:].apply(pd.to_numeric)

    # Sum across columns for each row
    total_credits = numeric_columns.sum(axis=1)

    # Calculate GPA by dividing total credits by 10
    gpa = total_credits / 10

    # Add the calculated GPA to the 'credits_df' DataFrame
    credits_df['GPA'] = gpa
    
    return credits_df

In [7]:
def get_combined_sgpa(*semesters: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    df["StudentName"] = semesters[0]["StudentName"]

    semester_names = [f"Sem{i+1}" for i in range(len(semesters))]

    for semester, semester_name in zip(semesters, semester_names):
        df = pd.merge(
            df, semester[["StudentName", "GPA"]], on="StudentName", how="inner"
        )
        df = df.rename(columns={"GPA": semester_name})

    df.iloc[:, 1:] = df.iloc[:, 1:].map(lambda x: 0 if x < 4 else x)
    # df[semester_names] = df[semester_names].fillna(df[semester_names].median())

    return df

In [8]:
sem1_data_str = '../data/Sem1.csv'
sem2_data_str = '../data/Sem2.csv'
sem3_data_str = '../data/Sem3.csv'
sem4_data_str = '../data/Sem4.csv'
sem5_data_str = '../data/Sem5.csv'
df = pd.read_csv(sem1_data_str)
subject_columns  = [col for col in df.columns if 'BIT' in col]
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CourseName          277 non-null    object 
 1   SectionName         277 non-null    object 
 2   SessionName         277 non-null    object 
 3   StudentId           277 non-null    int64  
 4   ExamRollNumber      277 non-null    object 
 5   StudentName         277 non-null    object 
 6   INT_BIT102          277 non-null    int64  
 7   EXT_BIT102          277 non-null    int64  
 8   INT_BIT103          277 non-null    int64  
 9   EXT_BIT103          277 non-null    int64  
 10  INT_BIT1P1          277 non-null    int64  
 11  EXT_BIT1P1          277 non-null    object 
 12  INT_BIT1P3          277 non-null    int64  
 13  EXT_BIT1P3          277 non-null    int64  
 14  INT_BIT104          277 non-null    int64  
 15  EXT_BIT104          277 non-null    object 
 16  INT_BIT1

In [9]:
sem1_csv = pd.read_csv(sem1_data_str)
sem2_csv = pd.read_csv(sem2_data_str)
sem3_csv = pd.read_csv(sem3_data_str)
sem4_csv = pd.read_csv(sem4_data_str)
sem5_csv = pd.read_csv(sem5_data_str)

In [10]:
sem1_csv.head()

Unnamed: 0,CourseName,SectionName,SessionName,StudentId,ExamRollNumber,StudentName,INT_BIT102,EXT_BIT102,INT_BIT103,EXT_BIT103,...,EXT_BIT1P2,INT_BIT101,EXT_BIT101,Remark,Grade,TotalMarksObtained,TotalMarks,CreditsEarned,Percentage,SGPA
0,FYBSC (IT) 1,A,OCT-21,3736422,S1/FEB22/R/ 219004,ADDAGATLA ADARSH VENKATESH,31,48,31,46,...,12,30,43,PASS,A,501,750,20,66.8,8.0
1,FYBSC (IT) 1,A,OCT-21,3736426,S1/FEB22/R/ 219010,AMRITKAR PIYUSH HARISHCHANDRA,23,49,28,46,...,0,25,38,FAIL,F,402,750,14,53.6,5.2
2,FYBSC (IT) 1,A,OCT-21,3736430,S1/FEB22/R/ 219014,BAMBALE ANANT MARUTI,16,44,28,42,...,18,20,39,FAIL,F,410,750,18,54.67,6.3
3,FYBSC (IT) 1,A,OCT-21,3736432,S1/FEB22/R/ 219016,BELOSHE TANMAY AJIT,29,52,30,47,...,12,28,52,FAIL,F,465,750,18,62.0,6.6
4,FYBSC (IT) 1,A,OCT-21,3736434,S1/FEB22/R/ 219019,/BODWADE SRUSHTI VIKAS,25,28,17,24,...,0,23,36,FAIL,F,386,750,18,51.47,6.0


In [11]:
# Get the information about the columns present in the dataset
sem1_csv.columns

Index(['CourseName', 'SectionName', 'SessionName', 'StudentId',
       'ExamRollNumber', 'StudentName', 'INT_BIT102', 'EXT_BIT102',
       'INT_BIT103', 'EXT_BIT103', 'INT_BIT1P1', 'EXT_BIT1P1', 'INT_BIT1P3',
       'EXT_BIT1P3', 'INT_BIT104', 'EXT_BIT104', 'INT_BIT105', 'EXT_BIT105',
       'INT_BIT1P5', 'EXT_BIT1P5', 'INT_BIT1P4', 'EXT_BIT1P4', 'INT_BIT1P2',
       'EXT_BIT1P2', 'INT_BIT101', 'EXT_BIT101', 'Remark', 'Grade',
       'TotalMarksObtained', 'TotalMarks', 'CreditsEarned', 'Percentage',
       'SGPA'],
      dtype='object')

In [12]:
# Get the information about the dataset
sem1_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CourseName          277 non-null    object 
 1   SectionName         277 non-null    object 
 2   SessionName         277 non-null    object 
 3   StudentId           277 non-null    int64  
 4   ExamRollNumber      277 non-null    object 
 5   StudentName         277 non-null    object 
 6   INT_BIT102          277 non-null    int64  
 7   EXT_BIT102          277 non-null    int64  
 8   INT_BIT103          277 non-null    int64  
 9   EXT_BIT103          277 non-null    int64  
 10  INT_BIT1P1          277 non-null    int64  
 11  EXT_BIT1P1          277 non-null    object 
 12  INT_BIT1P3          277 non-null    int64  
 13  EXT_BIT1P3          277 non-null    int64  
 14  INT_BIT104          277 non-null    int64  
 15  EXT_BIT104          277 non-null    object 
 16  INT_BIT1

In [13]:
sem1_data = clean_data(sem1_data_str)
sem2_data = clean_data(sem2_data_str)
sem3_data = clean_data(sem3_data_str)
sem4_data = clean_data(sem4_data_str)
sem5_data = clean_data(sem5_data_str)
sem2_data.info()
# sem1_data.to_csv('../sem1.csv')
# sem2_data.to_csv('../sem2.csv')
# sem3_data.to_csv('../sem3.csv')
# sem4_data.to_csv('../sem4.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275 entries, 0 to 274
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   StudentId    275 non-null    int64 
 1   StudentName  275 non-null    object
 2   INT_BIT202   275 non-null    int64 
 3   EXT_BIT202   275 non-null    object
 4   INT_BIT2P3   275 non-null    int64 
 5   EXT_BIT2P3   275 non-null    int64 
 6   INT_BIT205   275 non-null    int64 
 7   EXT_BIT205   275 non-null    object
 8   INT_BIT2P5   275 non-null    int64 
 9   EXT_BIT2P5   275 non-null    int64 
 10  INT_BIT2P2   275 non-null    int64 
 11  EXT_BIT2P2   275 non-null    int64 
 12  INT_BIT2P4   275 non-null    int64 
 13  EXT_BIT2P4   275 non-null    int64 
 14  INT_BIT2P1   275 non-null    int64 
 15  EXT_BIT2P1   275 non-null    object
 16  INT_BIT203   275 non-null    int64 
 17  EXT_BIT203   275 non-null    object
 18  INT_BIT204   275 non-null    int64 
 19  EXT_BIT204   275 non-null    

In [14]:
sem1_sgpa = calculate_sgpa1(sem1_data)
sem2_sgpa = calculate_sgpa1(sem2_data)
sem3_sgpa = calculate_sgpa1(sem3_data)
sem4_sgpa = calculate_sgpa1(sem4_data)
sem5_sgpa = calculate_sgpa1(sem5_data)

# sem1_sgpa.to_csv('../sem1_sgpa.csv')
# sem2_sgpa.to_csv('../sem2_sgpa.csv')
# sem3_sgpa.to_csv('../sem3_sgpa.csv')
# sem4_sgpa.to_csv('../sem4_sgpa.csv')


In [15]:
df = get_combined_sgpa(sem1_sgpa, sem2_sgpa, sem3_sgpa, sem4_sgpa)
# df.to_csv('../demo.csv')
get_basic_info(df)
df[df['Sem1'] == df.Sem1.min()]

Shape:
(114, 5)


Description:
             Sem1        Sem2        Sem3        Sem4
count  114.000000  114.000000  114.000000  114.000000
mean     7.258772    5.021930    5.708772    5.914035
std      1.729075    3.059144    2.548106    2.933115
min      0.000000    0.000000    0.000000    0.000000
25%      6.500000    4.225000    4.925000    4.825000
50%      7.750000    6.100000    6.550000    6.950000
75%      8.375000    7.275000    7.100000    7.800000
max      9.900000    9.400000    9.800000    9.700000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   StudentName  114 non-null    object 
 1   Sem1         114 non-null    float64
 2   Sem2         114 non-null    float64
 3   Sem3         114 non-null    float64
 4   Sem4         114 non-null    float64
dtypes: float64(4), object(1)
memory usage: 4.6+ KB
Information:None




Unnamed: 0,StudentName,Sem1,Sem2,Sem3,Sem4
14,BHATT OM RAJ RAJKUMAR SHARMA,0.0,0.0,0.0,0.0
88,SHIRSAT BHUSHAN DILIP,0.0,0.0,0.0,4.1
105,YADAV ADITYA RAMAWADH,0.0,0.0,5.4,7.3


In [16]:
# get_basic_info(df)

In [17]:
# print(sem3_data[sem3_data['StudentName'] == 'MORE HITESH OMPRAKASH SUNITA'])
print(df[df['StudentName'] == 'SHARMA SIDDHARTH AJAY'])
print(df[df.isna().any(axis=1)].count())
# df.head(10)

               StudentName  Sem1  Sem2  Sem3  Sem4
107  SHARMA SIDDHARTH AJAY   8.7   8.6   9.1   9.4
StudentName    0
Sem1           0
Sem2           0
Sem3           0
Sem4           0
dtype: int64


In [18]:
# from sklearn.base import r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
import warnings
warnings.filterwarnings("ignore")

features = ['Sem1', 'Sem2', 'Sem3']
target = 'Sem4'

X = df[features]
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    evs = explained_variance_score(true, predicted)
    return mae, mse, rmse, r2_square, evs

In [20]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Support Vector Regressor":SVR()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae ,model_train_mse, model_train_rmse, model_train_r2,model_train_evs = evaluate_model(y_train, y_train_pred)

    model_test_mae ,model_test_mse, model_test_rmse, model_test_r2,model_test_evs = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print("- Explained Variance Score: {:.4f}".format(model_train_evs))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    print("- Explained Variance Score: {:.4f}".format(model_test_evs))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1.7819
- Mean Squared Error: 3.1752
- Mean Absolute Error: 1.1840
- R2 Score: 0.6312
- Explained Variance Score: 0.6312
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.2701
- Mean Squared Error: 1.6133
- Mean Absolute Error: 0.9538
- R2 Score: 0.7987
- Explained Variance Score: 0.8024


Lasso
Model performance for Training set
- Root Mean Squared Error: 1.8482
- Mean Squared Error: 3.4159
- Mean Absolute Error: 1.3097
- R2 Score: 0.6033
- Explained Variance Score: 0.6033
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.3535
- Mean Squared Error: 1.8321
- Mean Absolute Error: 1.0826
- R2 Score: 0.7714
- Explained Variance Score: 0.7751


Ridge
Model performance for Training set
- Root Mean Squared Error: 1.7819
- Mean Squared Error: 3.1752
- Mean Absolute Error: 1.1846
- R2 Score: 0.6312
- Explained Variance Score: 0

In [21]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from tabulate import tabulate

# Assume 'evaluate_model' function is defined as before

features = ['Sem1', 'Sem2', 'Sem3']
target = 'Sem4'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Support Vector Regressor": SVR()
}

table_data = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2, model_train_evs = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2, model_test_evs = evaluate_model(y_test, y_test_pred)

    table_data.append([model_name,
                       model_train_rmse, model_train_mse, model_train_mae, model_train_r2, model_train_evs,
                       model_test_rmse, model_test_mse, model_test_mae, model_test_r2, model_test_evs])

# Define column headers
headers = ["Model", "Train RMSE", "Train MSE", "Train MAE", "Train R2", "Train Expl. Variance",
           "Test RMSE", "Test MSE", "Test MAE", "Test R2", "Test Expl. Variance"]

# Display the table
print(tabulate(table_data, headers, tablefmt="fancy_grid"))


╒══════════════════════════╤══════════════╤═════════════╤═════════════╤════════════╤════════════════════════╤═════════════╤════════════╤════════════╤═══════════╤═══════════════════════╕
│ Model                    │   Train RMSE │   Train MSE │   Train MAE │   Train R2 │   Train Expl. Variance │   Test RMSE │   Test MSE │   Test MAE │   Test R2 │   Test Expl. Variance │
╞══════════════════════════╪══════════════╪═════════════╪═════════════╪════════════╪════════════════════════╪═════════════╪════════════╪════════════╪═══════════╪═══════════════════════╡
│ Linear Regression        │     1.7819   │   3.17516   │   1.18398   │   0.631222 │               0.631222 │     1.27015 │    1.61327 │   0.953818 │  0.798735 │              0.802386 │
├──────────────────────────┼──────────────┼─────────────┼─────────────┼────────────┼────────────────────────┼─────────────┼────────────┼────────────┼───────────┼───────────────────────┤
│ Lasso                    │     1.84821  │   3.41588   │   1.30973   

In [22]:
# Create and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = linear_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Use the model to predict SGPA for new data
new_data = {
    'Sem1': [8.7, 9.1, 8.9, 9.9, 8.4, 8.7, 8.9],
    'Sem2': [8.6, 8.5, 9.1, 9.4, 6.9, 9.0, 9.3],
    'Sem3': [9.2, 9.1, 9.5, 9.8, 7.6, 8.9, 8.6]
}

new_df = pd.DataFrame(new_data)

# Make predictions for the new data
new_predictions = list(np.round(linear_model.predict(new_df), decimals=1))
new_predictions=pd.Series(new_predictions)
new_df_predict = pd.concat([new_df, new_predictions], axis=1).rename(columns={0:'Sem4_predicited'})

# Display the predictions
print('\nPredictions for new data:')
print(new_predictions)
new_df_predict

Mean Squared Error: 1.6132699182900754
R-squared: 0.7987350394843148

Predictions for new data:
0    9.1
1    8.9
2    9.4
3    9.5
4    7.5
5    9.1
6    9.0
dtype: float64


Unnamed: 0,Sem1,Sem2,Sem3,Sem4_predicited
0,8.7,8.6,9.2,9.1
1,9.1,8.5,9.1,8.9
2,8.9,9.1,9.5,9.4
3,9.9,9.4,9.8,9.5
4,8.4,6.9,7.6,7.5
5,8.7,9.0,8.9,9.1
6,8.9,9.3,8.6,9.0


In [23]:
df

Unnamed: 0,StudentName,Sem1,Sem2,Sem3,Sem4
0,ADDAGATLA ADARSH VENKATESH,7.8,4.7,5.4,6.7
1,/BODWADE SRUSHTI VIKAS,6.0,0.0,0.0,0.0
2,SAMANTRA JAGANNATH SIMANCHAL,8.4,7.3,7.6,8.9
3,MANE VAIBHAV UDAYSINGH,8.5,0.0,5.5,5.0
4,JADHAV SHUBHAM ANIL,5.5,0.0,0.0,0.0
...,...,...,...,...,...
109,/TIWARI POOJA MANISH,7.9,6.2,7.1,6.9
110,MAURYA AMIT VIJAYBAHADUR,5.8,0.0,5.4,4.5
111,/GUPTA SEJAL KAMLESH,6.5,4.9,6.1,8.9
112,SINGH ANIKET GYANENDRA KUMAR,7.7,5.3,6.4,6.4


In [24]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score

features = ['Sem1', 'Sem2', 'Sem3']
target = 'Sem4'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

ridge_model = Ridge()

param_grid = {'alpha': np.logspace(-6, 6, 1000) }

grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# Train the Ridge Regression model with the best hyperparameters
sem4_model = Ridge(alpha=best_alpha)
sem4_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = sem4_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Explained Variance Score: {evs}')


Best Alpha: 80.15006961565398
Mean Squared Error: 1.3934898452323556
R-squared: 0.8109051216343581
Explained Variance Score: 0.8124490520363934


In [25]:
# Use the model to predict SGPA for new data
new_data = {
    'Sem1': [8.7, 9.1, 8.9, 9.9, 8.4, 8.7, 8.9],
    'Sem2': [8.6, 8.5, 9.1, 9.4, 6.9, 9.0, 9.3],
    'Sem3': [9.2, 9.1, 9.5, 9.8, 7.6, 8.9, 8.6]
}

new_df = pd.DataFrame(new_data)

# Make predictions for the new data
new_predictions = list(np.round(sem4_model.predict(new_df), decimals=1))
new_predictions=pd.Series(new_predictions)
new_df_predict = pd.concat([new_df, new_predictions], axis=1).rename(columns={0:'Sem4_predicited'})

# Display the predictions
print('\nPredictions for new data:')
print(new_predictions)
new_df_predict


Predictions for new data:
0    8.8
1    8.7
2    9.1
3    9.3
4    7.4
5    8.9
6    8.9
dtype: float64


Unnamed: 0,Sem1,Sem2,Sem3,Sem4_predicited
0,8.7,8.6,9.2,8.8
1,9.1,8.5,9.1,8.7
2,8.9,9.1,9.5,9.1
3,9.9,9.4,9.8,9.3
4,8.4,6.9,7.6,7.4
5,8.7,9.0,8.9,8.9
6,8.9,9.3,8.6,8.9


In [26]:
import cloudpickle

with open("../models/sem4_model.pkl", 'wb') as f:
    cloudpickle.dump(sem4_model, f)
    f.close()

In [27]:
df = get_combined_sgpa(sem1_sgpa, sem2_sgpa, sem3_sgpa)

In [28]:
features = ['Sem1', 'Sem2']
target = 'Sem3'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

ridge_model = Ridge()

param_grid = {'alpha': np.logspace(-6, 6, 1000) }

grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# Train the Ridge Regression model with the best hyperparameters
sem3_model = Ridge(alpha=best_alpha)
sem3_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = sem3_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Explained Variance Score: {evs}')


Best Alpha: 84.70868266557402
Mean Squared Error: 1.8549481620301362
R-squared: 0.6509270870764388
Explained Variance Score: 0.6512157071148301


In [29]:
import cloudpickle

with open("../models/sem3_model.pkl", 'wb') as f:
    cloudpickle.dump(sem3_model, f)
    f.close()

In [30]:
df = get_combined_sgpa(sem1_sgpa, sem2_sgpa)
features = ['Sem1']
target = 'Sem2'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

ridge_model = Ridge()

param_grid = {'alpha': np.logspace(-6, 6, 1000) }

grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# Train the Ridge Regression model with the best hyperparameters
sem2_model = Ridge(alpha=best_alpha)
sem2_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = sem2_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Explained Variance Score: {evs}')


Best Alpha: 1e-06
Mean Squared Error: 5.655830208850036
R-squared: 0.5263839739117373
Explained Variance Score: 0.536472277192187


In [31]:
import cloudpickle

with open("../models/sem2_model.pkl", 'wb') as f:
    cloudpickle.dump(sem2_model, f)
    f.close()

In [33]:
df = get_combined_sgpa(sem1_sgpa, sem2_sgpa, sem3_sgpa, sem4_sgpa, sem5_sgpa)
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score

features = ['Sem1', 'Sem2', 'Sem3', 'Sem4']
target = 'Sem5'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

ridge_model = Ridge()

param_grid = {'alpha': np.logspace(-6, 6, 1000) }

grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# Train the Ridge Regression model with the best hyperparameters
sem5_model = Ridge(alpha=best_alpha)
sem5_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = sem5_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Explained Variance Score: {evs}')


Best Alpha: 67.89406812696113
Mean Squared Error: 1.6789907901141168
R-squared: 0.4315393574005214
Explained Variance Score: 0.46125777082164343


In [None]:
import cloudpickle

with open("../models/sem5_model.pkl", 'wb') as f:
    cloudpickle.dump(sem5_model, f)
    f.close()

In [31]:
# import os
# from tempfile import NamedTemporaryFile


# data = pd.DataFrame({
#         'StudentId': [1, 2],
#         'StudentName': ['Student1', 'Student2'],
#         'BIT1': ['80', 'CC'],
#         'BIT2': ['AB', '75']
#     })
# # data = data.to_csv()
# # cleaned_data = clean_data(data)
# with NamedTemporaryFile(mode='w', delete=False, suffix='.csv', newline='') as temp_csv:
#     data.to_csv(temp_csv, index=False)
#     # Use the temporary CSV file for testing
# print(pd.read_csv(temp_csv.name))
# cleaned_data = clean_data(temp_csv.name)
# os.remove(temp_csv.name)
# cleaned_data

In [32]:
# import re
# import numpy as np

# def calculate_sgpa1_test(_sem_data_):

#     subject_columns = [col for col in _sem_data_.columns if col.startswith(('INT', 'EXT'))]
#     subject_columns = sorted(subject_columns)

#     credits_df = pd.DataFrame()
#     credits_df['StudentId'] = _sem_data_.StudentId
#     credits_df['StudentName'] = _sem_data_.StudentName

#     for subject_column in subject_columns:
#         subject_name = subject_column.split('_')[1]
#         total_marks = _sem_data_[subject_column].astype('int') + _sem_data_[subject_column.replace('EXT', 'INT')].astype('int')

#         if re.match(r'BIT(\d{1})P(\d{1})', subject_name):
#             # Apply the 'calculate_practical_percentage' function to the total marks
#             total_marks = calculate_practical_percentage(total_marks)

#         subject_columns.remove(subject_column.replace('EXT', 'INT'))

#         # Convert percentage marks to GPA based on criteria
#         gpa = calculate_credits(total_marks)

#         # Add the calculated GPA to the credits DataFrame
#         credits_df[subject_name] = gpa
#     print(credits_df)
#     # Convert each column to numeric in the specified range
#     numeric_columns = credits_df.iloc[:, 2:].apply(pd.to_numeric)

#     # Set GPA to 0 for students with any value less than 4
#     gpa = numeric_columns.sum(axis=1)
#     gpa[numeric_columns.lt(4).any(axis=1)] = 0

#     # Add the calculated GPA to the 'credits_df' DataFrame
#     credits_df['GPA'] = gpa / 10

#     return credits_df

# data = {
#     'StudentId': [1, 2, 3],
#     'StudentName': ['Alice', 'Bob', 'Charlie'],
#     'INT_BIT301': [16, 12, 15],
#     'EXT_BIT301': [46, 34, 25],
#     'INT_BIT302': [17, 14, 4],
#     'EXT_BIT302': [53, 41, 22],
#     'INT_BIT3P1': [15, 10, 15],
#     'EXT_BIT3P1': [20, 25, 25]
# }

# # Test the function
# result = calculate_sgpa1_test(pd.DataFrame(data))
# result

   StudentId StudentName BIT301 BIT302 BIT3P1
0          1       Alice      8      9      9
1          2         Bob      5      7      9
2          3     Charlie      4      0     10


Unnamed: 0,StudentId,StudentName,BIT301,BIT302,BIT3P1,GPA
0,1,Alice,8,9,9,2.6
1,2,Bob,5,7,9,2.1
2,3,Charlie,4,0,10,0.0
