In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Data loading and preprocessing

file_path = '2024_03_numunit_withOMOPtarget.csv'
data = pd.read_csv(file_path)

# 'description' is the text that gets vectorized into model features, so rows
# without it are unusable for both training and prediction. Dropping them once
# here also covers both subsets derived below.
data = data.dropna(subset=['description'])

# Rows with a known 'source_code_description' are the labelled training
# examples; rows where it is missing are the ones to predict.
is_labelled = data['source_code_description'].notna()

# 'percent' and 'score' labels are excluded from the training data.
excluded_labels = ['percent', 'score']
is_excluded = data['source_code_description'].isin(excluded_labels)

# Copy AFTER all filtering: both frames get new columns assigned later in the
# notebook, and assigning to a frame that is still a filtered slice of another
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
train_data = data[is_labelled & ~is_excluded].copy()
predict_data = data[~is_labelled].copy()


In [3]:
# inspecting train and test data

# Both triple-quoted blocks in this cell are plain string literals used to
# disable code: nothing inside them is executed.
# This first string is a bare expression statement; its value is discarded.
"""
print(train_data.info(verbose=True)) # looks fine; has correct amount of rows
print(predict_data.info(verbose=True)) # looks fine; has correct amount of rows
"""

# exporting samples of the train and test data
# This string is the last expression in the cell, so the notebook echoes its
# repr as the cell output; no sample files are written while it stays quoted.
"""
train_sample = train_data.sample(n=200, random_state=1)
train_sample.to_csv('processing/train_sample_large.csv', index=False)

pred_sample = predict_data.sample(n=200, random_state=1)
pred_sample.to_csv('processing/pred_sample.csv', index=False)
"""

"\ntrain_sample = train_data.sample(n=200, random_state=1)\ntrain_sample.to_csv('processing/train_sample_large.csv', index=False)\n\npred_sample = predict_data.sample(n=200, random_state=1)\npred_sample.to_csv('processing/pred_sample.csv', index=False)\n"

In [4]:
# Build the model inputs (features and labels) from the labelled rows

# Features: bag-of-words over 'description'. binary=True records only whether
# a token is present in a row, not how many times it occurs.
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_data['description'])

# Labels: map each distinct 'source_code_description' value to an integer
# code and keep those codes on the frame as the 'target' column.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(train_data['source_code_description'])
train_data['target'] = encoded_labels

# The fitted label_encoder is kept so predicted codes can be mapped back to
# their original label strings later.
y_train = train_data['target']


In [5]:
# inspecting the feature and label layers

# Both triple-quoted blocks in this cell are plain string literals used to
# disable code: nothing inside them is executed.

# viewing X_train
# Disabled snippet: would densify X_train and dump it to a CSV file.
# This first string is a bare expression statement; its value is discarded.
"""
copied_X_train = X_train.copy()

# Convert the sparse matrix to a dense matrix (numpy array)
dense_matrix = copied_X_train.toarray()

# Create a DataFrame from the dense matrix
df = pd.DataFrame(dense_matrix)

# Export the DataFrame to a CSV file
csv_file_path = 'processing/sparse_matrix_dense.csv'
df.to_csv(csv_file_path, index=False)

print(f'Sparse matrix exported as dense matrix to {csv_file_path}')
"""

# viewing list of tokens found during vectorization

# Disabled snippet: would print the vectorizer's vocabulary and write it to a
# file. This string is the last expression in the cell, so the notebook echoes
# its repr as the cell output.
# NOTE(review): the string is not raw, so the "\n" in the f.write(...) line is
# an actual newline inside the literal; restore it when re-enabling this code.
"""
# Get the list of all tokens found in the data
vocabulary = vectorizer.vocabulary_
tokens = list(vocabulary.keys())

# Sort tokens by their index to make it more readable
sorted_tokens = sorted(tokens, key=lambda x: vocabulary[x])

# Print the sorted list of tokens
print(sorted_tokens)

# If you want to save the tokens to a file for further inspection
with open('processing/tokens_list.csv', 'w') as f:
    for token in sorted_tokens:
        f.write(f"{token}\n")
"""

'\n# Get the list of all tokens found in the data\nvocabulary = vectorizer.vocabulary_\ntokens = list(vocabulary.keys())\n\n# Sort tokens by their index to make it more readable\nsorted_tokens = sorted(tokens, key=lambda x: vocabulary[x])\n\n# Print the sorted list of tokens\nprint(sorted_tokens)\n\n# If you want to save the tokens to a file for further inspection\nwith open(\'processing/tokens_list.csv\', \'w\') as f:\n    for token in sorted_tokens:\n        f.write(f"{token}\n")\n'

In [6]:
# Hyperparameters forwarded to LogisticRegression in the training cell

# Regularisation
penalty = 'l2'                 # penalty type
C = 0.01                       # inverse regularisation strength (smaller = stronger)

# Optimisation
solver = 'newton-cg'           # optimisation algorithm
max_iter = 10000               # upper bound on solver iterations

# Class handling
multi_class = 'multinomial'    # single multinomial model rather than one-vs-rest
class_weight = 'balanced'      # reweight classes inversely to their frequency

In [7]:
# Train a multinomial logistic regression model on the labelled rows
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
model = LogisticRegression(
    solver=solver,
    C=C,
    penalty=penalty,
    max_iter=max_iter,
    multi_class=multi_class,
    class_weight=class_weight
)
model.fit(X_train, y_train)

# Vectorize the 'description' column for prediction data. Only transform() is
# called, so the vocabulary fitted on the training data is reused.
X_predict = vectorizer.transform(predict_data['description'])

# Predict the 'source_code_description' for the prediction data and map the
# integer class codes back to their original label strings.
y_predict = model.predict(X_predict)
predict_data['predicted_source_code_description'] = label_encoder.inverse_transform(y_predict)

# Combine the results back into the original dataset for comparison or further analysis
result_data = pd.concat([train_data, predict_data], ignore_index=True)

# Display the predictions for the missing data
#print(predict_data[['description', 'predicted_source_code_description']])

# Save the results to a new CSV file. to_csv does not create missing folders,
# so make sure the output directory exists first; otherwise a fresh checkout
# fails here, after the model fit has already completed.
output_path = Path('processing/predicted_2024_03_numunit_withOMOPtarget.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
result_data.to_csv(output_path, index=False)