In [None]:
import csv
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC
from sqlalchemy import create_engine

### <span style='color:Pink'>Extract all skills from database</span> 

In [None]:
# Connection settings for the MySQL database that holds the scraped jobs.
# The password is deliberately NOT stored in the notebook: it is taken from
# the MYSQL_PASSWORD environment variable, with an interactive prompt as a
# fallback when the variable is unset or empty.
db_config = {
    "host": "localhost",
    "user": "root",
    "password": os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: "),
    "database": "sample"
}

table_name = "job_scrape"

output_csv_path = "skills.csv"

# Bind the name before the try block so the finally clause can always
# reference it, even when create_engine() itself raises.
engine = None
try:
    # Create an SQLAlchemy engine. User and password are URL-escaped so that
    # characters such as '@', ':' or '/' cannot corrupt the connection URL.
    engine = create_engine(
        f"mysql+mysqlconnector://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
        f"@{db_config['host']}/{db_config['database']}"
    )

    # Query to fetch data from the table
    query = f"SELECT skills FROM {table_name}"
    df = pd.read_sql(query, engine)

    # Export data to CSV
    df.to_csv(output_csv_path, index=False)

    print(f"Data exported successfully to {output_csv_path}")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Release pooled connections only if the engine was actually created.
    if engine is not None:
        engine.dispose()

### <span style='color:Pink'>Reformat the skills.csv file</span> 

In [None]:
# Raw string so the backslashes of the Windows path are taken literally
# instead of being parsed as (invalid) escape sequences.
csv_file_path = r"D:\Beloiter\Sophomore\Data Clinic\skills.csv"

# Open the CSV file. newline="" is what the csv module requires for correct
# handling of quoted fields that contain line breaks; the encoding is pinned
# to UTF-8, the default encoding DataFrame.to_csv writes with.
with open(csv_file_path, "r", newline="", encoding="utf-8") as f:
    csv_reader = csv.reader(f)
    skills_lst = list(csv_reader)

# Drop the header row.
skills_temp = skills_lst[1:]

# Every cell holds a comma-separated list of skills: flatten all cells into
# one list of trimmed skill names, skipping fragments that are empty after
# trimming (e.g. from "a,,b" or an empty cell).
skills = [
    skill.strip()
    for row in skills_temp
    for cell in row
    for skill in cell.split(",")
    if skill.strip()
]

# Show a summary rather than dumping the whole list into the cell output.
print(f"Parsed {len(skills)} skills; first 20: {skills[:20]}")

### <span style='color:Pink'>Extract unlabeled skills</span> 

In [None]:
# Wrap the flat list of skill strings in a single-column table and write it
# to disk; the row index is written as well (index=True).
skills_dict = {"skill": list(skills)}

df = pd.DataFrame(skills_dict)
df.to_csv("uncleaned_skills.csv", index=True)

### <span style='color:Pink'>Train a skill classification model</span> 

In [None]:
# Load the labeled data. Rows missing either the skill text or its label are
# dropped up front because they cannot be vectorized or used as a target.
# dropna() keeps the original index labels, so rows can still be looked up
# later with labeled_data.loc[...].
labeled_data = pd.read_csv(
    'cleaned_skills4.csv', encoding='latin-1', on_bad_lines='skip'
).dropna(subset=['skill', 'skill_group'])

# Separate features (skills) and labels (skill groups)
X_labeled = labeled_data['skill']
y = labeled_data['skill_group']

# Split the RAW text into training and testing sets first. The split only
# depends on the number of rows and random_state, so the same rows land in
# each half as when splitting an already-vectorized matrix.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_labeled, y, test_size=0.5, random_state=42
)

# Text preprocessing using TF-IDF vectorization. The vectorizer is fitted on
# the training portion only, so the vocabulary and IDF weights carry no
# information from the held-out test rows (avoids train/test leakage).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_test = tfidf_vectorizer.transform(X_test_text)

# Initialize the SVM classifier
svm_classifier = SVC()

# Define the parameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

# Perform GridSearchCV (5-fold cross-validation on the training set) to find
# the best hyperparameters
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best SVM model and parameters
best_svm_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

### <span style='color:Pink'>Save and load the trained model</span> 

In [None]:
# File names under which the fitted artifacts are persisted.
model_path = 'best_svm_classifier_model.joblib'
vectorizer_path = 'tfidf_vectorizer.joblib'

# Write the tuned classifier and the fitted TF-IDF vectorizer to disk ...
dump(best_svm_model, model_path)
dump(tfidf_vectorizer, vectorizer_path)

# ... and read both back, rebinding the names to the reloaded objects.
best_svm_model = load(model_path)
tfidf_vectorizer = load(vectorizer_path)

### <span style='color:Pink'>Return predicted results</span> 

In [None]:
# Predict the held-out test set. `predictions` must be computed here: no
# earlier cell defines it, so on a fresh kernel this cell would otherwise
# fail with a NameError.
predictions = best_svm_model.predict(X_test)

# Side-by-side table of true and predicted skill groups; it is indexed like
# y_test.
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print("\nTested Results:")
print(results_df)

### <span style='color:Pink'>Show incorrectly predicted rows</span> 

In [None]:
# Print incorrectly predicted rows.
# The test-set predictions are computed in this cell so that it does not rely
# on a `predictions` variable left over from another cell or kernel session.
predictions = best_svm_model.predict(X_test)

# Boolean mask (aligned with y_test) marking the misclassified test rows.
incorrect_indices = (y_test != predictions).to_numpy()

# Look the misclassified rows up in the full labeled table via their index labels.
incorrect_df = labeled_data.loc[y_test.index[incorrect_indices]]
print("Incorrectly Predicted Rows:")
print(incorrect_df)

### <span style='color:Pink'>Classify one skill</span> 

In [None]:
# Classify a single example skill: the vectorizer expects an iterable of
# documents, hence the one-element list.
test_skill = ["Analyze Information"]

# Project the text into the fitted TF-IDF space and predict its group.
test_skill_tfidf = tfidf_vectorizer.transform(test_skill)
prediction = best_svm_model.predict(test_skill_tfidf)

predicted_group = prediction[0]
print("Predicted Skill Group:", predicted_group)

### <span style='color:Pink'>Classify all unlabeled skills</span> 

In [None]:
# Read the skills that still need a group; malformed CSV lines are skipped.
unclassified_data = pd.read_csv(
    'unclassified_skills4.csv',
    encoding='latin-1',
    on_bad_lines='skip',
)

# Vectorize the skill text with the already-fitted TF-IDF vectorizer.
X_unclassified = unclassified_data['skill']
X_tfidf_unclassified = tfidf_vectorizer.transform(X_unclassified)

# Predict a group for every skill and attach it as a new column.
predicted_labels = best_svm_model.predict(X_tfidf_unclassified)
unclassified_data['predicted_skill_group'] = predicted_labels

# Persist the labeled table (without the row index) for review or further analysis.
unclassified_data.to_csv('predicted_skills4.csv', index=False)

### <span style='color:Pink'>Import finalized skill table to MySQL</span> 

In [None]:
# The database password is not stored in the notebook: it is taken from the
# MYSQL_PASSWORD environment variable, with an interactive prompt as a
# fallback when the variable is unset or empty.
db_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")

# The password is URL-escaped so special characters cannot corrupt the URL.
engine = create_engine(
    f"mysql+mysqlconnector://root:{quote_plus(db_password)}@localhost:3306/sample",
    echo=False,
)
csv_file_path = 'finalized_skill_classification.csv'
df = pd.read_csv(csv_file_path)
table_name = 'skills'
try:
    # if_exists='replace' drops and recreates the table on every run.
    df.to_sql(table_name, con=engine, if_exists='replace', index=False)
finally:
    # Release pooled connections whether or not the upload succeeded.
    engine.dispose()