In [None]:
import pandas as pd

In [None]:
# Load the resume dataset and preview its first rows.
resumedata = pd.read_csv('Resume Dataset including names.csv')
# head must be *called*; printing the bare attribute shows the bound-method repr instead of a preview.
print(resumedata.head())

In [None]:
import nltk
from nltk.corpus import stopwords
import string

# Fetch the NLTK 'stopwords' corpus, which stopwords.words('english') reads during preprocessing.
nltk.download('stopwords')

#Preprocessing Textual Data
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Filled on first use so the stop-word corpus is read once, not once per resume.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on the first call."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lower-case `text`, strip ASCII punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN/None from a missing cell)
        yields an empty string instead of raising.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stop-word list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)

# Clean every resume with preprocess_text, overwriting the 'Text' column, then preview it.
resumedata['Text'] = resumedata['Text'].map(preprocess_text)
print(resumedata[['Text']].head())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the cleaned resumes into a TF-IDF matrix capped at 1500 features,
# additionally dropping scikit-learn's built-in English stop words.
# NOTE(review): the vectorizer is fitted on every row, before the train/test split
# done later in the notebook, so test documents influence the vocabulary/IDF — confirm this is acceptable.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1500,
)
X = vectorizer.fit_transform(resumedata['Text'])
print(X.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the 'Category' column as integer class ids; `le` is kept so ids can be mapped back to names.
le = LabelEncoder()
category_labels = resumedata['Category']
y = le.fit_transform(category_labels)
# Preview the first ten encoded targets.
print(y[:10])

In [None]:
#Hold out 20% of the rows for testing and train on the remaining 80% (fixed seed for repeatability)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(X_train.shape, X_test.shape)

In [None]:
#K-Nearest-Neighbors classifier that votes among the 3 nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f'Accuracy on training set: {knn.score(X_train, y_train):.2f}')
# Score the predictions already computed above instead of running a second pass over X_test.
print(f'Accuracy on test set: {accuracy_score(y_test, y_pred):.2f}')
print("\nClassification report:\n")
# Pin the label ids so target_names always lines up with them, even when a
# category is missing from the random test split (otherwise the report raises).
class_ids = list(range(len(le.classes_)))
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=le.classes_,
    zero_division=0,  # a class with no predicted/true samples reports 0 instead of warning
))

In [None]:
#Plotting Confusion Matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Fix the row/column order to the encoder's class ids so the matrix always has one
# row and column per entry of le.classes_, keeping the tick labels aligned even
# when a category never appears in y_test or y_pred.
class_ids = list(range(len(le.classes_)))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
!pip install wordcloud

In [None]:
import re

# Reference table of baby names read from CSV.
baby_names = pd.read_csv('baby-names.csv')

# Lower-cased set of every listed name, used for membership tests against resume words.
baby_names_set = {name for name in baby_names['name'].str.lower()}

# Words that were previously picked up as names although they are not names here;
# they are excluded from matching.
non_name_terms = set([
    'texas', 'green', 'page', 'junior', 'senior',
    'mr', 'ms', 'dr',
    'first', 'last', 'place', 'may', 'angeles',
])

#Taking the first name from the dataset and only selecting the first valid name.
# Lazily built lowercase-name -> sex lookup, so the baby_names frame is scanned
# once instead of being lower-cased and filtered again for every resume.
_NAME_TO_GENDER_CACHE = None


def _default_name_to_gender():
    """Build (once) a lowercase-name -> 'sex' dict from the global `baby_names` frame.

    For a name listed several times, the first row wins, matching the original
    `.iloc[0]` lookup.
    """
    global _NAME_TO_GENDER_CACHE
    if _NAME_TO_GENDER_CACHE is None:
        lowered = baby_names['name'].str.lower()
        first_rows = ~lowered.duplicated()
        _NAME_TO_GENDER_CACHE = dict(zip(lowered[first_rows], baby_names.loc[first_rows, 'sex']))
    return _NAME_TO_GENDER_CACHE


def extract_first_valid_name_and_predict_gender(text, baby_names_set, non_name_terms, name_to_gender=None):
    """Find the first word of `text` that is a known name and return its gender.

    Parameters
    ----------
    text : str
        Resume text. Non-string values (e.g. NaN/None) are treated as having no name.
    baby_names_set : set of str
        Lower-cased known names.
    non_name_terms : set of str
        Lower-cased words that must never be accepted as a name.
    name_to_gender : dict, optional
        Lower-cased name -> gender label. Defaults to a lookup built once from
        the global `baby_names` frame ('name' and 'sex' columns).

    Returns
    -------
    tuple
        `(gender, name)` for the first valid name, or `('unknown', None)` when
        no word qualifies.
    """
    if not isinstance(text, str):
        return 'unknown', None
    if name_to_gender is None:
        name_to_gender = _default_name_to_gender()
    # Keep only letters and whitespace so tokens compare cleanly against the name set.
    normalized_text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    for word in normalized_text.split():
        if word in baby_names_set and word not in non_name_terms and word in name_to_gender:
            return name_to_gender[word], word
    return 'unknown', None

#Adding prediction to the resumedata set
def apply_gender_prediction(resumedata, baby_names_set, non_name_terms):
    """Add 'predicted_gender' and 'extracted_names' columns to `resumedata` in place.

    Each row's 'Text' is passed to extract_first_valid_name_and_predict_gender;
    the gender goes into 'predicted_gender' and the matched name into
    'extracted_names'. Nothing is returned.
    """
    outcomes = [
        extract_first_valid_name_and_predict_gender(resume_text, baby_names_set, non_name_terms)
        for resume_text in resumedata['Text']
    ]
    resumedata['predicted_gender'] = [gender for gender, _ in outcomes]
    resumedata['extracted_names'] = [name for _, name in outcomes]

# Annotate the resumes in place, then preview the text next to the new columns.
apply_gender_prediction(
    resumedata=resumedata,
    baby_names_set=baby_names_set,
    non_name_terms=non_name_terms,
)
print(resumedata[['Text', 'predicted_gender', 'extracted_names']].head())

In [None]:
import ethnicolr

# The ethnicolr call below is given both a first- and a last-name column, so add an
# empty placeholder column to serve as the last name.
resumedata['dummy_last_name'] = ''

# Run the resumes through ethnicolr's pred_wiki_name using the extracted first names.
resumedata = ethnicolr.pred_wiki_name(
    resumedata,
    fname_col='extracted_names',
    lname_col='dummy_last_name',
)

print(resumedata.columns)

# Select the columns whose name mentions a European group (adjust to the columns printed above).
# NOTE(review): any other prediction columns ethnicolr returns are ignored here, so every
# row ends up labelled with one of the selected groups — confirm this is intended.
ethnicity_columns = [
    column
    for column in resumedata.columns
    if any(tag in column for tag in ("GreaterEuropean", "WestEuropean"))
]

# The label for each row is the name of its highest-valued selected column.
resumedata['predicted_ethnicity'] = resumedata[ethnicity_columns].idxmax(axis=1)

print(resumedata[['Text', 'extracted_names', 'predicted_gender', 'predicted_ethnicity']].head())

In [None]:
#Tally how many resumes fall under each predicted gender and each predicted ethnicity
gender_counts = resumedata['predicted_gender'].value_counts()
ethnicity_counts = resumedata['predicted_ethnicity'].value_counts()

for heading, counts in (
    ("Gender Distribution:\n", gender_counts),
    ("\nEthnicity Distribution:\n", ethnicity_counts),
):
    print(heading, counts)

In [None]:
#Creating Bar Chart showing the quantities of each ethnicity and gender in each job category
ethnicity_mapping = {
    0: 'GreaterEuropean,British',
    1: 'GreaterEuropean,EastEuropean',
    2: 'GreaterEuropean,Jewish',
    3: 'GreaterEuropean,WestEuropean,French',
    4: 'GreaterEuropean,WestEuropean,Germanic',
    5: 'GreaterEuropean,WestEuropean,Hispanic',
    6: 'GreaterEuropean,WestEuropean,Italian',
    7: 'GreaterEuropean,WestEuropean,Nordic'
}


def _plot_category_distribution(data, x_col, hue_col, palette, title, legend_title):
    """Draw one count plot of `hue_col` within each value of `x_col` (job category)."""
    plt.figure(figsize=(12, 8))
    sns.countplot(data=data, x=x_col, hue=hue_col, palette=palette)
    plt.title(title)
    plt.xlabel('Job Category')
    plt.ylabel('Count')
    plt.xticks(rotation=90)
    plt.legend(title=legend_title, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


# No cell in this notebook creates 'encoded_ethnicity' or 'encoded_category', so
# relying on them raises KeyError on a fresh kernel. Use them when present, and
# otherwise fall back to the columns the notebook does create.
if 'encoded_ethnicity' in resumedata.columns:
    resumedata['ethnicity_label'] = resumedata['encoded_ethnicity'].map(ethnicity_mapping)
else:
    resumedata['ethnicity_label'] = resumedata['predicted_ethnicity']

category_column = 'encoded_category' if 'encoded_category' in resumedata.columns else 'Category'

#Plot distribution of ethnicities
_plot_category_distribution(
    resumedata,
    x_col=category_column,
    hue_col='ethnicity_label',
    palette='Set1',
    title='Ethnicity Distribution in Each Job Category',
    legend_title='Ethnicity',
)

#Plot distribution of gender
_plot_category_distribution(
    resumedata,
    x_col=category_column,
    hue_col='predicted_gender',
    palette='Set2',
    title='Gender Distribution in Each Job Category',
    legend_title='Gender',
)

In [None]:
#Creating Heatmaps to detail the number of each ethnicity and gender per job category
def _show_count_heatmap(pivot, cmap, title, xlabel):
    """Render one annotated heatmap of candidate counts with job categories on the y-axis."""
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        pivot,
        annot=True,
        fmt='d',
        cmap=cmap,
        cbar_kws={'label': 'Number of Candidates'},
        annot_kws={'size': 10},
    )
    plt.title(title, fontsize=14)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel('Job Category', fontsize=12)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Leave out rows whose prediction is None or the 'unknown' placeholder.
gender_is_known = ~resumedata['predicted_gender'].isin([None, 'unknown'])
ethnicity_is_known = ~resumedata['predicted_ethnicity'].isin([None, 'unknown'])
filtered_data_gender = resumedata[gender_is_known]
filtered_data_ethnicity = resumedata[ethnicity_is_known]

# Gender: count rows per (category, gender), then pivot to a category x gender table.
gender_counts = (
    filtered_data_gender
    .groupby(['Category', 'predicted_gender'])
    .size()
    .reset_index(name='count')
)
gender_pivot = gender_counts.pivot_table(
    index='Category',
    columns='predicted_gender',
    values='count',
    aggfunc='sum',
    fill_value=0,
)
_show_count_heatmap(
    gender_pivot,
    'Blues',
    'Distribution of Job Categories by Predicted Gender (Excluding Null and Unknown)',
    'Predicted Gender',
)

# Ethnicity: same steps for the predicted ethnicity.
ethnicity_counts = (
    filtered_data_ethnicity
    .groupby(['Category', 'predicted_ethnicity'])
    .size()
    .reset_index(name='count')
)
ethnicity_pivot = ethnicity_counts.pivot_table(
    index='Category',
    columns='predicted_ethnicity',
    values='count',
    aggfunc='sum',
    fill_value=0,
)
_show_count_heatmap(
    ethnicity_pivot,
    'YlGnBu',
    'Distribution of Job Categories by Predicted Ethnicity (Excluding Null and Unknown)',
    'Predicted Ethnicity',
)


In [None]:
from wordcloud import WordCloud

#Word clouds of the resume text, one per predicted gender

#Split the resumes by predicted gender label ('boy' / 'girl')
male_data = resumedata[resumedata['predicted_gender'].eq('boy')]
female_data = resumedata[resumedata['predicted_gender'].eq('girl')]

#Concatenate each group's non-missing text into a single string
male_text = " ".join(male_data['Text'].dropna())
female_text = " ".join(female_data['Text'].dropna())

male_wordcloud = WordCloud(background_color="white", width=800, height=400).generate(male_text)
female_wordcloud = WordCloud(background_color="white", width=800, height=400).generate(female_text)

#Show both clouds side by side
plt.figure(figsize=(12, 6))
panels = [
    (male_wordcloud, "Word Cloud for Males"),
    (female_wordcloud, "Word Cloud for Females"),
]
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(heading)

plt.show()

In [None]:
#Filtering data and listing the top 5 words for each gender
# Counter was never imported anywhere in the notebook; import it here so the cell runs on a fresh kernel.
from collections import Counter

if 'predicted_gender' not in resumedata.columns or 'Text' not in resumedata.columns:
    print("Error: Required columns 'predicted_gender' or 'Text' are missing in the DataFrame.")
else:
    male_data = resumedata[resumedata['predicted_gender'] == 'boy']
    female_data = resumedata[resumedata['predicted_gender'] == 'girl']
    male_text = " ".join(male_data['Text'].dropna())
    female_text = " ".join(female_data['Text'].dropna())

    # Split into words first: a Counter over the raw string tallies single characters, not words.
    male_word_counts = Counter(male_text.split())
    female_word_counts = Counter(female_text.split())

    top_5_male = male_word_counts.most_common(5)
    top_5_female = female_word_counts.most_common(5)

    print("Top 5 words for Males:")
    for word, count in top_5_male:
        print(f"{word}: {count}")

    print("\nTop 5 words for Females:")
    for word, count in top_5_female:
        print(f"{word}: {count}")

In [None]:
#Filter through ethnicities and plot wordclouds for all
if 'predicted_ethnicity' not in resumedata.columns:
    print("Error: 'predicted_ethnicity' column not found in the DataFrame")
else:
    # Drop missing labels: comparing a column against NaN selects no rows, and an
    # empty text would make WordCloud.generate fail.
    ethnicities = resumedata['predicted_ethnicity'].dropna().unique()
    num_ethnicities = len(ethnicities)

    if num_ethnicities == 0:
        print("No predicted ethnicities available to plot.")
    else:
        # Grid layout: at most 3 panels per row (`cols` was previously undefined),
        # with the row count computed by integer ceiling division (no numpy needed).
        cols = min(3, num_ethnicities)
        rows = (num_ethnicities + cols - 1) // cols

        # squeeze=False always returns a 2-D array of axes, so flatten() also works for a 1x1 grid.
        fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5), squeeze=False)
        axes = axes.flatten()

        for idx, ethnicity in enumerate(ethnicities):
            ethnicity_data = resumedata[resumedata['predicted_ethnicity'] == ethnicity]
            text_data = " ".join(ethnicity_data['Text'].dropna())

            ax = axes[idx]
            ax.axis('off')
            ax.set_title(f'WordCloud for {ethnicity}', fontsize=12)
            # Only build a cloud when there is text; the panel keeps its title otherwise.
            if text_data.strip():
                wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_data)
                ax.imshow(wordcloud, interpolation='bilinear')

        # Hide any unused panels in the last row.
        for idx in range(num_ethnicities, len(axes)):
            axes[idx].axis('off')

        plt.tight_layout()
        plt.show()

In [None]:
#Filtering Ethnicity data to determine the top 5 words for each
# Counter was never imported anywhere in the notebook; import it here so the cell runs on a fresh kernel.
from collections import Counter

if 'predicted_ethnicity' not in resumedata.columns or 'Text' not in resumedata.columns:
    print("Error: Required columns 'predicted_ethnicity' or 'Text' are missing in the DataFrame.")
else:
    # Skip missing labels; comparing against NaN would select no rows and print an empty group.
    ethnicities = resumedata['predicted_ethnicity'].dropna().unique()

    for ethnicity in ethnicities:
        ethnicity_data = resumedata[resumedata['predicted_ethnicity'] == ethnicity]

        all_text = " ".join(ethnicity_data['Text'].dropna())

        # Split into words first: a Counter over the raw string tallies single characters, not words.
        word_counts = Counter(all_text.split())

        top_5_words = word_counts.most_common(5)

        print(f"Top 5 words for ethnicity '{ethnicity}':")
        for word, count in top_5_words:
            print(f"{word}: {count}")
        print()
