In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import LinearSVC

In [2]:
# Data loading and preprocessing

file_path = '2024_03_numunit_withOMOPtarget_synonyms_stage2.csv'
data = pd.read_csv(file_path)
data.dropna(subset=['description'], inplace=True)

# Separate the data into training and prediction datasets
train_data = data.dropna(subset=['source_code_description']).copy()
predict_data = data[data['source_code_description'].isna()].copy()


# Drop rows with NaN values in 'description_clean' column from the training data
train_data.dropna(subset=['description_clean'], inplace=True)
predict_data.dropna(subset=['description_clean'], inplace=True)

# drop 'percent' and 'score' rows from `source_code_description` in training data
#train_data = train_data[~train_data['source_code_description'].isin(['percent', 'score',])]


In [3]:
# inspecting train and test data


print(data.info(verbose=True))

print(train_data.info(verbose=True)) # looks fine; has correct amount of rows

print(predict_data.info(verbose=True)) # looks fine; has correct amount of rows


# exporting samples of the train and test data

train_sample = train_data.sample(n=200, random_state=1)
train_data.to_csv('processing/train_data.csv', index=False)
pred_sample = predict_data.sample(n=200, random_state=1)
pred_sample.to_csv('processing/pred_sample.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29324 entries, 0 to 29323
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   numunitid                29324 non-null  int64 
 1   obsval                   29324 non-null  int64 
 2   source_code_description  3852 non-null   object
 3   description              29324 non-null  object
 4   synonyms                 3809 non-null   object
 5   description_clean        29281 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3850 entries, 0 to 27515
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   numunitid                3850 non-null   int64 
 1   obsval                   3850 non-null   int64 
 2   source_code_description  3850 non-null   object
 3   description              3850 non-nu

In [4]:
# Encoding feature and label layers

# Label encode the target variable for training data
label_encoder = LabelEncoder()
train_data['target'] = label_encoder.fit_transform(train_data['source_code_description'])

# Vectorize the 'description_clean' column using Count Vectorizer for training data
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_data['description_clean'])

# Extract the target variable for training data
y_train = train_data['target']


In [5]:
# Assuming y_train is the array of target labels for your training data
classes = np.unique(y_train)

# Automatically calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}


`compute_class_weight`: This function automatically calculates the appropriate class weights based on the frequency of each class in the training data.
`class_weight='balanced'`: Tells the function to compute class weights inversely proportional to the class frequencies. This means that rarer classes get higher weights.

In [6]:
# inspecting the feature and label layers

# viewing X_train
"""
copied_X_train = X_train.copy()

# Convert the sparse matrix to a dense matrix (numpy array)
dense_matrix = copied_X_train.toarray()

# Create a DataFrame from the dense matrix
df = pd.DataFrame(dense_matrix)

# Export the DataFrame to a CSV file
csv_file_path = 'processing/sparse_matrix_dense.csv'
df.to_csv(csv_file_path, index=False)

print(f'Sparse matrix exported as dense matrix to {csv_file_path}')
"""

# viewing list of tokens found during vectorization

"""
# Get the list of all tokens found in the data
vocabulary = vectorizer.vocabulary_
tokens = list(vocabulary.keys())

# Sort tokens by their index to make it more readable
sorted_tokens = sorted(tokens, key=lambda x: vocabulary[x])

# Print the sorted list of tokens
print(sorted_tokens)

# If you want to save the tokens to a file for further inspection
with open('processing/tokens_list.csv', 'w') as f:
    for token in sorted_tokens:
        f.write(f"{token}\n")
"""

'\n# Get the list of all tokens found in the data\nvocabulary = vectorizer.vocabulary_\ntokens = list(vocabulary.keys())\n\n# Sort tokens by their index to make it more readable\nsorted_tokens = sorted(tokens, key=lambda x: vocabulary[x])\n\n# Print the sorted list of tokens\nprint(sorted_tokens)\n\n# If you want to save the tokens to a file for further inspection\nwith open(\'processing/tokens_list.csv\', \'w\') as f:\n    for token in sorted_tokens:\n        f.write(f"{token}\n")\n'

In [17]:
# Define the hyperparameters


C = 1     # small C (<1) gives strong regularization to correct overfitting; large C (>100) gives weak regularization to correct underfitting
penalty = 'l2'    #l2 = ridge
max_iter = 20000
solver = "LinearSVC"
multi_class = 'multinomial'
class_weight = class_weights_dict

In [15]:
# Train a multinomial logistic regression model
model = LinearSVC(
    
    C=1,
    
    max_iter=max_iter,
    
    class_weight=class_weight 
)
model.fit(X_train, y_train)

LinearSVC(C=1,
          class_weight={0: 20.263157894736842, 1: 20.263157894736842,
                        2: 20.263157894736842, 3: 20.263157894736842,
                        4: 20.263157894736842, 5: 5.065789473684211,
                        6: 1.1919504643962848, 7: 0.010488176964149503,
                        8: 10.131578947368421, 9: 6.754385964912281,
                        10: 20.263157894736842, 11: 1.8421052631578947,
                        12: 20.263157894736842, 13: 20.263157894736842,
                        14: 20.263157894736842, 15: 20.263157894736842,
                        16: 5.065789473684211, 17: 6.754385964912281,
                        18: 6.754385964912281, 19: 0.8810068649885584,
                        20: 6.754385964912281, 21: 0.8810068649885584,
                        22: 10.131578947368421, 23: 20.263157894736842,
                        24: 5.065789473684211, 25: 10.131578947368421,
                        26: 0.8105263157894737, 27: 0.6754385964

In [18]:
# evaluate performance on the training data

# Predict on the training data
y_train_pred = model.predict(X_train)

# Evaluate the model's performance
train_accuracy = accuracy_score(y_train, y_train_pred)
classification_rep = classification_report(y_train, y_train_pred, target_names=label_encoder.classes_)

print(f"Training Accuracy: {train_accuracy}")
print("Classification Report:\n", classification_rep)

Training Accuracy: 0.9436363636363636
Classification Report:
                                                          precision    recall  f1-score   support

                                          Bodansky unit       1.00      1.00      1.00         1
              Bone DXA Calcaneus [T-score] Bone density       0.50      1.00      0.67         1
                                                Decibel       1.00      1.00      1.00         1
                                        Farad per liter       0.02      1.00      0.04         1
                                          avidity index       1.00      1.00      1.00         1
                                                billion       0.43      0.75      0.55         4
                                billion cells per liter       0.74      1.00      0.85        17
                                      billion per liter       1.00      0.97      0.98      1932
                                 billion per milliliter       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# applying the model to the predict data and outputting results

# Vectorize the 'description_clean' column for prediction data
X_predict = vectorizer.transform(predict_data['description_clean'])
# Predict the 'source_code_description' for the prediction data
y_predict = model.predict(X_predict)

# Predict the decision function values for the 'source_code_description' for the prediction data
decision_values = model.decision_function(X_predict)
# Initialize lists to store the final predictions and decision values
final_predictions = []
final_decision_values = []

# Add the predictions to the predict_data DataFrame
predict_data['predicted_source_code_description'] = label_encoder.inverse_transform(y_predict)

# Combine the results back into the original dataset for comparison or further analysis
result_data = pd.concat([train_data, predict_data], ignore_index=True)

# If you want to save the results to a new CSV file
result_data.to_csv(f'processing/LinearSVC/predicted_2024_03_numunit_withOMOPtarget_{solver}_{C}.csv', index=False)

print("Final predictions have been saved.")

Final predictions have been saved.


In [11]:
# i can't remember what i was trying in this cell...

"""

# Train a multinomial logistic regression model with new variables
new_vectorizer = CountVectorizer(binary=True)
X_train_new = new_vectorizer.fit_transform(train_data['description'])
y_train_new = label_encoder.transform(train_data['source_code_description'])

new_model = LogisticRegression(
    solver='saga',   
    C=10,     
    penalty='l2',    
    max_iter=1000,
    multi_class='multinomial',
    class_weight=None 
)
new_model.fit(X_train_new, y_train_new)

# Predict on the training data using the trained model
y_train_pred_new = new_model.predict(X_train_new)
train_data['predicted_source_code_description'] = label_encoder.inverse_transform(y_train_pred_new)

# Vectorize the 'description' column for the prediction data
X_predict_new = new_vectorizer.transform(predict_data['description'])

# Predict the 'source_code_description' for the prediction data
y_predict_new = new_model.predict(X_predict_new)
predict_data['predicted_source_code_description'] = label_encoder.inverse_transform(y_predict_new)

# Combine the results back into the original dataset for comparison or further analysis
result_data_new = pd.concat([train_data, predict_data], ignore_index=True)

# Save the results to a new CSV file
result_data_new.to_csv('processing/MLR/predicted_2024_03_numunit_withOMOPtarget_saga_10_traindata.csv', index=False)
"""

"\n\n# Train a multinomial logistic regression model with new variables\nnew_vectorizer = CountVectorizer(binary=True)\nX_train_new = new_vectorizer.fit_transform(train_data['description'])\ny_train_new = label_encoder.transform(train_data['source_code_description'])\n\nnew_model = LogisticRegression(\n    solver='saga',   \n    C=10,     \n    penalty='l2',    \n    max_iter=1000,\n    multi_class='multinomial',\n    class_weight=None \n)\nnew_model.fit(X_train_new, y_train_new)\n\n# Predict on the training data using the trained model\ny_train_pred_new = new_model.predict(X_train_new)\ntrain_data['predicted_source_code_description'] = label_encoder.inverse_transform(y_train_pred_new)\n\n# Vectorize the 'description' column for the prediction data\nX_predict_new = new_vectorizer.transform(predict_data['description'])\n\n# Predict the 'source_code_description' for the prediction data\ny_predict_new = new_model.predict(X_predict_new)\npredict_data['predicted_source_code_description'] = 

In [12]:
print("Script executed")

Script executed
