In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Load the label file and break its combined `session_id` key on '_':
# the first piece becomes the integer session id, and the last piece with its
# leading character removed becomes the integer column `q`.
train_label_file = pd.read_csv("/kaggle/input/predict-student-performance-from-game-play/train_labels.csv")
id_parts = train_label_file['session_id'].str.split('_')
train_label_file['q'] = id_parts.str[-1].str[1:].astype('int64')
train_label_file['session_id'] = id_parts.str[0].astype('int64')
# Persist the split labels so the merge step can stream them from disk.
train_label_file.to_csv('/kaggle/working/train_labels_split.csv', index=False)

In [None]:
# Open both CSV inputs as read-only text handles; the merge cell below
# consumes them line by line.
train_csv_path = '/kaggle/input/predict-student-performance-from-game-play/train.csv'
train_labels_csv_path = '/kaggle/working/train_labels_split.csv'
train_file = open(train_csv_path, mode='r')
train_label_file = open(train_labels_csv_path, mode='r')

In [None]:

# Merge label rows with gameplay rows on their first column and write the
# result to /kaggle/working/merged_data.csv.
#
# Expects `train_file` and `train_label_file` to be open text handles positioned
# at the start of their files. Both handles are closed when this cell finishes
# (even on error), so the cell that opens them must be re-run before running
# this one again. The headers are read before the output file is opened, so a
# re-run on closed handles fails without truncating an existing merged file.
try:
    train_header = train_file.readline().strip().split(',')
    train_label_header = train_label_file.readline().strip().split(',')

    # Index the gameplay rows by their first field.
    # NOTE(review): the dict keeps ONE row per key -- a later row with the same
    # first field overwrites the earlier one. Confirm that keeping only the
    # last row per id is intended.
    train_data_dict = {}
    for train_line in train_file:
        train_line = train_line.strip()
        if not train_line:
            continue  # ignore blank lines instead of indexing them under ''
        train_data = train_line.split(',')
        train_data_dict[train_data[0]] = train_data

    # The context manager guarantees the output is flushed and closed even if
    # the loop raises part-way through.
    with open('/kaggle/working/merged_data.csv', 'w') as output_file:
        # Both headers are written as-is, so the join key appears twice.
        output_file.write(','.join(train_header + train_label_header) + '\n')

        for train_label_line in train_label_file:
            train_label_line = train_label_line.strip()
            if not train_label_line:
                continue  # ignore blank lines
            train_label_data = train_label_line.split(',')
            id_value = train_label_data[0]  # join key: first column of the label row

            train_data = train_data_dict.get(id_value)
            if train_data is None:
                print(f"No matching train data found for ID: {id_value}")
                continue

            output_file.write(','.join(train_data + train_label_data) + '\n')
finally:
    # Release the input handles opened by the previous cell.
    train_file.close()
    train_label_file.close()

In [None]:

# Remove every column of merged_data.csv that contains at least one null and
# save the remaining columns as modified_file.csv.

# Rows read per chunk while scanning for nulls
chunk_size = 100000

# Pass 1: stream the file and gather the names of columns that hold a null
# in any chunk. A set de-duplicates names seen in several chunks.
null_columns = set()
for chunk in pd.read_csv('/kaggle/working/merged_data.csv', chunksize=chunk_size):
    column_has_null = chunk.isna().any(axis=0)
    null_columns.update(column_has_null[column_has_null].index)

# Expose the result as a list of unique column names
null_columns = list(null_columns)

# Pass 2: reload the file, skipping the columns flagged above
df = pd.read_csv(
    '/kaggle/working/merged_data.csv',
    usecols=lambda col: col not in null_columns,
)

# Write the filtered table to a new CSV file
df.to_csv('/kaggle/working/modified_file.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
input_dim = 11  # Number of input features the network expects

# Build a small fully connected binary classifier
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(input_dim,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Chunked reading / splitting configuration
chunk_size = 100000
split_ratio = 0.8
split_seed = 42  # fixed seed so the train/validation split is reproducible

train_data = pd.DataFrame()
val_data = pd.DataFrame()
train_data_types = {
    "session_id": str,
    "index": np.int32,
    "elapsed_time": np.int32,
    "event_name": str,
    "name": str,
    "level": str,
    "room_fqid": str,
    "fullscreen": np.int32,
    "hq": np.int32,
    "music": np.int32,
    "level_group": str,  # key previously had a trailing space and was never applied
    "session_id.1": str,
    "correct": np.int32,
    "q": np.int32,
}
modified_path = '/kaggle/working/modified_file.csv'
dropped_columns = ['session_id.1', 'q']  # not used as model inputs

# Pass 1: collect the full vocabulary of every string column so that each
# column gets ONE encoder. Re-fitting an encoder on every chunk (as before)
# can assign different integers to the same value in different chunks.
category_values = {}
for chunk in pd.read_csv(modified_path, chunksize=chunk_size, dtype=train_data_types):
    chunk = chunk.drop(columns=dropped_columns)
    for column in chunk.select_dtypes(include=['object']).columns:
        category_values.setdefault(column, set()).update(chunk[column].unique())

# One fitted encoder per string column, keyed by column name.
label_encoders = {
    column: LabelEncoder().fit(sorted(values))
    for column, values in category_values.items()
}

# Pass 2: encode, split and train chunk by chunk.
# NOTE(review): as in the original, each iteration re-fits on ALL rows seen so
# far (train_data/val_data accumulate), so earlier chunks receive more epochs
# than later ones. Confirm this is intended rather than fitting on the new
# chunk only.
count = 0
for chunk in pd.read_csv(modified_path, chunksize=chunk_size, dtype=train_data_types):
    chunk = chunk.drop(columns=dropped_columns)

    # Apply the file-wide encoders so codes are identical in every chunk
    for column, encoder in label_encoders.items():
        chunk[column] = encoder.transform(chunk[column]).astype(np.int32)

    # Downcast any remaining int64 columns to int32
    int_columns = chunk.select_dtypes(include='int64').columns
    if len(int_columns):
        chunk[int_columns] = chunk[int_columns].astype(np.int32)

    chunk_train, chunk_val = train_test_split(
        chunk, train_size=split_ratio, random_state=split_seed
    )
    train_data = pd.concat([train_data, chunk_train], ignore_index=True)
    val_data = pd.concat([val_data, chunk_val], ignore_index=True)

    # Split the features and labels
    X_train = train_data.drop('correct', axis=1).values
    y_train = train_data['correct'].values
    X_val = val_data.drop('correct', axis=1).values
    y_val = val_data['correct'].values

    # Fail early with a clear message if the file's columns do not match the
    # input width the network was built with.
    if X_train.shape[1] != input_dim:
        raise ValueError(
            f"Expected {input_dim} feature columns but found {X_train.shape[1]}"
        )

    # Train the model
    print(f"chunk {count}")
    model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))
    count = count + 1
    # Evaluate the model
    loss, accuracy = model.evaluate(X_val, y_val)
    print('Validation Loss:', loss)
    print('Validation Accuracy:', accuracy)


In [None]:
# Load the test data, encode it against the training vocabulary, and write the
# model's predictions to submission.csv.
feature_columns = [
    "session_id", "index", "elapsed_time", "event_name", "name", "level",
    "room_fqid", "fullscreen", "hq", "music", "level_group",
]
test_data_types = {
    "session_id": str,
    "index": np.int32,
    "elapsed_time": np.int32,
    "event_name": str,
    "name": str,
    "level": str,
    "room_fqid": str,
    "fullscreen": np.int32,
    "hq": np.int32,
    "music": np.int32,
    "level_group": str,  # key previously had a trailing space and was never applied
}
test_data = pd.read_csv(
    '/kaggle/input/predict-student-performance-from-game-play/test.csv',
    usecols=feature_columns,
    dtype=test_data_types,
)
# read_csv keeps the file's own column order; force the feature order explicitly
# so X_test columns line up with feature_columns.
test_data = test_data[feature_columns]

# Encode string columns against the vocabulary of the training file instead of
# re-fitting an encoder on the test data. Fitting on the test set alone gives
# different integer codes whenever it lacks some training categories.
# Categories are sorted, which is the ordering LabelEncoder uses when fitted on
# a full column. Values never seen in training are encoded as -1.
object_columns = test_data.select_dtypes(include=['object']).columns
train_categories = pd.read_csv(
    '/kaggle/working/modified_file.csv',
    usecols=list(object_columns),
    dtype=str,
)
for column in object_columns:
    vocabulary = sorted(train_categories[column].dropna().unique())
    test_data[column] = pd.Categorical(
        test_data[column], categories=vocabulary
    ).codes.astype(np.int32)

# Downcast any remaining int64 columns to int32
int_columns = test_data.select_dtypes(include='int64').columns
if len(int_columns):
    test_data[int_columns] = test_data[int_columns].astype(np.int32)

# Get the test features
X_test = test_data.values
# Make predictions on the test data
predictions = model.predict(X_test)
submission_df = pd.DataFrame({'Prediction': predictions.flatten()})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
"User": {f"Hi, I'm having trouble with [specific issue or task]. \\
    Can you help me troubleshoot or provide guidance on how to resolve it? "}

"Assistant": {"Sure, I'd be happy to assist you. \\
              Please provide me with some additional information about the problem you're facing. What specific error messages or symptoms are you encountering? Have you tried any troubleshooting steps so far? 
              The more details you can provide, \\\
              the better I can assist you in resolving the issue."}