# Milestone 1

# 1.1 Data Collection

In [1]:
import pandas as pd
import re 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
df = pd.read_csv('eduqg_llm_formatted.csv')


In [5]:
df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Accounting is sometimes called the “language o...,Wall Street,business,Main Street,financial statements,financial statements,B
1,1,What is a characteristic of Financial accounti...,should be incomplete in order to confuse compe...,should be prepared differently by each company,provides investors guarantees about the future,summarizes what has already occurred,should be incomplete in order to confuse compe...,D
2,2,Which of the following is not included in exte...,lenders such as bankers,governmental agencies such as the IRS,employees of a business,potential investors,governmental agencies such as the IRS,C
3,3,Which of the following groups would have acces...,bankers,investors,competitors of the business,managers,bankers,D
4,4,All of the following are examples of manageria...,preparing external financial statements in com...,deciding whether or not to use automation,making equipment repair or replacement decisions,measuring costs of production for each product...,deciding whether or not to use automation,A


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3369 entries, 0 to 3368
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3369 non-null   int64 
 1   prompt  3369 non-null   object
 2   A       3369 non-null   object
 3   B       3369 non-null   object
 4   C       3369 non-null   object
 5   D       3369 non-null   object
 6   E       3369 non-null   object
 7   answer  3369 non-null   object
dtypes: int64(1), object(7)
memory usage: 210.7+ KB


# 1.2 Data Preprocessing
# 1.3 Tools and libraries

In [7]:
df.isnull().sum()

id        0
prompt    0
A         0
B         0
C         0
D         0
E         0
answer    0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()
    
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    
    processed_tokens = [
        lemmatizer.lemmatize(word) 
        for word in tokens 
        if word not in stop_words
    ]
    
    return ' '.join(processed_tokens)

text_columns = ['prompt', 'A', 'B', 'C', 'D', 'E']
processed_data = {}

for col in text_columns:
    new_col_name = f'{col}_processed'
    df[new_col_name] = df[col].apply(preprocess_text)
    processed_data[col] = df[new_col_name]

print(df[['prompt', 'prompt_processed']].head())

                                              prompt  \
0  Accounting is sometimes called the “language o...   
1  What is a characteristic of Financial accounti...   
2  Which of the following is not included in exte...   
3  Which of the following groups would have acces...   
4  All of the following are examples of manageria...   

                                    prompt_processed  
0     accounting sometimes called language following  
1    characteristic financial accounting information  
2  following included external user financial acc...  
3  following group would access managerial accoun...  
4  following example managerial accounting activi...  


In [11]:
from sklearn.model_selection import train_test_split
X = df[[f'{col}_processed' for col in text_columns]]
y = df['answer'] 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)

train_df.to_csv('cleaned_train_dataset.csv', index=False)
val_df.to_csv('cleaned_validation_dataset.csv', index=False)
