Convert all '.doc' files to '.docx' files (easier to read & parse) 

In [75]:
from glob import glob
import re
import os
import win32com.client as win32
from win32com.client import constants

# Create list of paths to .doc files
paths = glob(os.getcwd() + '\\Graduating Directory of 7th Semester ( Graduate Book )\\**\\*.doc', recursive=True)

def save_as_docx(path):
    # Opening MS Word
    word = win32.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(path)
    doc.Activate ()

    # Rename path with .docx
    new_file_abs = os.path.abspath(path)
    new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)

    # Save and Close
    word.ActiveDocument.SaveAs(
        new_file_abs, FileFormat=constants.wdFormatXMLDocument
    )
    doc.Close(False)

for path in paths:
    if os.path.exists(f'{path}x'):
        continue
    print(f'Creating: {path}x')
    save_as_docx(path)

Create corpus

In [76]:
import string
import re
string.punctuation
#defining the function to remove punctuation
def remove_punctuation(text):
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    return ' '.join(text.split())

def remove_newlines(text):
    return ' '.join(text.splitlines())

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

In [78]:
import textract
import sys

encoding = sys.getdefaultencoding()

paths = glob(os.getcwd() + '\\Graduating Directory of 7th Semester ( Graduate Book )\\**\\*.docx', recursive=True)
corpus = []

showed_once = False
for path in paths:
    try:
        text = textract.process(path)
        text = text.decode(encoding)
        # text preprocessing
        text = remove_punctuation(text)
        text = remove_newlines(text)
        text = remove_numbers(text)
        if not showed_once:
            print(f'Document text becomes in this format: (Order of sentence does not matter)\n{text}')
            showed_once = True
        corpus.append(text)
    except:
        print(f'\nUnable to process file: {path}\n')

Document text becomes in this format: (Order of sentence does not matter)
University of Sargodha Department of Computer Science Information Technology Falakniaz EDUCATION QUALIFICATION University of Sargodha   BSCS Honors Government College Peshawar Pre Engineering Subjects Mathematics Physic Chemistry Khyber Model High School  Matriculation Science PROJECTS Research Project Virtual Assistant Hologram Artificial Intelligence Using Python Computer Networking Database Systems Library Management System Website Development ACTIVITIES HONORS AWARDS Member Event Management Society University of Sargodha  SKILLS TOOLS Tools Visual Studio SQL Server Management Studio Microsoft Office Coral Photoshop Microsoft Visual Skills Adobe Photoshop Adobe Illustrator Adobe XD Adobe premiere Pro Web Development HTML  CSS Bootstrap JavaScript ASP net PHP Languages C C C HTML CSS Bootstrap JavaScript PHP INTERESTS Reading Cricket volunteering in university events BSCS  Date of Birth  May  Business Address C

Fit TF-IDF vectorizer

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer and fit the corpus and transform them accordingly
v = TfidfVectorizer(lowercase=False)
v.fit(corpus)

In [80]:
# Print the idf of each word:

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    
    #let's get the index in the vocabulary
    indx = v.vocabulary_.get(word)
    
    #get the score
    idf_score = v.idf_[indx]
    
    print(f"{word} : {idf_score}")

ABDEN : 5.799914262780603
ABDUL : 5.799914262780603
ABDULLAH : 5.799914262780603
ACCP : 5.799914262780603
ACTIVITIES : 1.3340061441260194
ADP : 5.799914262780603
ADPCS : 5.799914262780603
ADS : 5.799914262780603
AES : 5.1067670822206574
AFZAL : 5.799914262780603
AHSAN : 5.799914262780603
AI : 4.295836866004329
AIR : 5.799914262780603
AKHTAR : 5.799914262780603
AKRAM : 5.799914262780603
ALEEHA : 5.799914262780603
ALEENA : 5.799914262780603
ALI : 4.413619901660713
ALSO : 5.799914262780603
AMMAR : 5.799914262780603
ANAM : 5.799914262780603
AND : 5.799914262780603
ANZA : 5.799914262780603
API : 5.394449154672439
APPlications : 5.799914262780603
APSACS : 5.799914262780603
AQSA : 5.799914262780603
AREEJ : 5.799914262780603
ARIF : 5.799914262780603
AROOJ : 5.799914262780603
ARSLAN : 5.394449154672439
ARZOO : 5.799914262780603
ASAD : 5.799914262780603
ASHRAF : 5.1067670822206574
ASIM : 5.799914262780603
ASP : 3.785011242238338
ATM : 4.883623530906448
AUG : 5.799914262780603
AWARDS : 1.33976984

Read dataset and clean + preprocess

In [81]:
import pandas as pd

#read the data into a pandas dataframe
df = pd.read_excel("all_company_data.xlsx")

df = df.fillna('')
df['skills'] = df['company_domain'] + ' ' + df['company_expertise']
df['skills'] = df['skills'].apply(remove_punctuation)
df = df.drop(labels=['web-scraper-order', 'web-scraper-start-url', 'pages', 'company_domain', 'company_expertise'], axis=1)

indices = list(range(df.shape[0]))
str_indices = [str(i) for i in indices]
df = df.set_index([pd.Index(str_indices)])

df.head(5)

Unnamed: 0,company_name,company_location,skills
0,MTechSoft,Faisalabad,eCommerce
1,MTechSoft,Faisalabad,Mobile Development
2,MTechSoft,Faisalabad,UI UX Design Creative
3,MTechSoft,Faisalabad,Web Development
4,MTechSoft,Faisalabad,Angular Js


Give each company an id (ML works with numbers only)

In [82]:
#Add the new column which gives a unique number to each of these labels 
companies = df['company_name'].unique()
companies = {companies[i]:i for i in range(len(companies))}

df['company_num'] = df['company_name'].map(companies)

#checking the results 
df.head(5)

Unnamed: 0,company_name,company_location,skills,company_num
0,MTechSoft,Faisalabad,eCommerce,0
1,MTechSoft,Faisalabad,Mobile Development,0
2,MTechSoft,Faisalabad,UI UX Design Creative,0
3,MTechSoft,Faisalabad,Web Development,0
4,MTechSoft,Faisalabad,Angular Js,0


Train-test split

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['skills'], 
    df.company_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2023
)

print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X_train:  (1208,)
Shape of y_train:  (1208,)
Shape of X_test:  (302,)
Shape of y_test:  (302,)


Train KNN model

In [84]:
X_train.head()

334             Mobile Development
276              Software Products
862             Mobile Development
1506    In House Ui Ux Design Team
1273             Software Services
Name: skills, dtype: object

In [85]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_tfidf', v),    
    ('KNN', KNeighborsClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train.T, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test.T)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.25      0.33      0.29         3
           2       0.07      1.00      0.13         1
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           9       0.04      0.50      0.08         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         0
          21       0.00      0.00      0.00         2
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         0
          25       0.00      0.00      0.00         0
          26       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
import random

random_file = random.choice(paths)

text = textract.process(path)
text = text.decode(encoding)
# text preprocessing
text = remove_punctuation(text)
text = remove_newlines(text)
text = remove_numbers(text)
print(f'Document text becomes in this format: (Order of sentence does not matter)\n{text}')

pred = clf.predict([text])[0]

df[df['company_num'] == pred]

Document text becomes in this format: (Order of sentence does not matter)
University of Sargodha Department of Computer Science Information Technology Waqar younous EDUCATION QUALIFICATION University of Sargodha   MscIT CGPA   Superior college Pre Engineering Subjects Mathematics Physic Computer Bsc Subjects Double mathematics computer High school  Matriculation PROJECTS Research Project D games Environment setup using procedural approach Research on Electronic Games assets Android project Uos gpa cgpa app published on playstore Fraud catching app Object Oriented Analysis and Design Design document of Hospital Management System Database Systems Library Management System Design and Analysis of Algorithms Graph Coloring ACTIVITIES HONORS AWARDS Member Event Management Society University of Sargodha  SKILLS TOOLS Tools Android Studio Visual studio MS project Ms visio Anaconda pycharm Camtasia Studio Postman Unity Audacity character animator Media Encoder code blocks DBCA Packet Tracer O

Unnamed: 0,company_name,company_location,skills,company_num
250,Whizpool,Islamabad,Artifical Intelligence Machine Learning,76
251,Whizpool,Islamabad,Consulting,76
252,Whizpool,Islamabad,Data Sciences,76
253,Whizpool,Islamabad,E Learning,76
254,Whizpool,Islamabad,Human Resource Management Solution,76
255,Whizpool,Islamabad,Internet of Things IoT,76
256,Whizpool,Islamabad,Mobile Development,76
257,Whizpool,Islamabad,Software Products,76
258,Whizpool,Islamabad,Software Services,76
259,Whizpool,Islamabad,Web Development,76
