Convert all '.doc' files to '.docx' files (easier to read & parse) 

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd /content/drive/MyDrive/jobproject/
!ls

/content/drive/MyDrive/jobproject
 all_company_data.xlsx
'Graduating Directory of 7th Semester ( Graduate Book )'


In [3]:
!pip install textract > /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.9.0 requires jedi>=0.10, which is not installed.[0m[31m
[0m

Create corpus

In [4]:
import string
import re
string.punctuation
#defining the function to remove punctuation
def remove_punctuation(text):
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    return ' '.join(text.split())

def remove_newlines(text):
    return ' '.join(text.splitlines())

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

In [5]:
import textract
from glob import glob
import os
import sys

encoding = sys.getdefaultencoding()

paths = glob(os.getcwd() + '/Graduating Directory of 7th Semester ( Graduate Book )/**/*.docx', recursive=True)
corpus = []

showed_once = False
for path in paths:
    try:
        text = textract.process(path)
        text = text.decode(encoding)
        # text preprocessing
        text = remove_punctuation(text)
        text = remove_newlines(text)
        text = remove_numbers(text)
        if not showed_once:
            print(f'Document text becomes in this format: (Order of sentence does not matter)\n{text}')
            showed_once = True
        corpus.append(text)
    except:
        print(f'\nUnable to process file: {path}\n')

Document text becomes in this format: (Order of sentence does not matter)
BSSE  Date of Birth    Business Address CS IT Department University of Sargodha University Road Sargodha Pakistan Cell  Email aqsayaqoob gmail com University of Sargodha Department of Computer Science Information Technology Aqsa Yaqoob EDUCATION QUALIFICATION University of Sargodha   BSSE Honors CGPA   Air Base Inter College Mushaf Sargodha  ICS Subjects Computer Mathematics Physics Fazaia Model Inter College Mushaf Sargodha  Matriculation Computer PROJECTS Research Project Bug Bounty finding vulnerabilities in a web applcation Information Security Attacking and defending system Database Systems Rescue  Management System Web Sites Ecommerce website Truffles shop ACTIVITIES HONORS AWARDS Cheif of Air Staff Gold Medal  SKILLS TOOLS Tools Visual Studio NetBeans Adobe Illustrator Microsoft Office Wordpress Skills Web Development Udemy course Sql Injetions Udemy course Web Application Penetration Testing tools in Kali

Fit TF-IDF vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer and fit the corpus and transform them accordingly
v = TfidfVectorizer(lowercase=False)
v.fit(corpus)

TfidfVectorizer(lowercase=False)

In [7]:
# Print the idf of each word:

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    
    #let's get the index in the vocabulary
    indx = v.vocabulary_.get(word)
    
    #get the score
    idf_score = v.idf_[indx]
    
    print(f"{word} : {idf_score}")

ABDEN : 4.49650756146648
ABDULLAH : 4.49650756146648
ACCP : 4.49650756146648
ACTIVITIES : 1.297834443915799
ADP : 4.49650756146648
ADPCS : 4.49650756146648
AES : 4.49650756146648
AI : 4.49650756146648
AIR : 4.49650756146648
AKHTAR : 4.49650756146648
ALI : 4.49650756146648
ANZA : 4.49650756146648
ARSLAN : 4.49650756146648
ARZOO : 4.49650756146648
ASAD : 4.49650756146648
ASHRAF : 4.49650756146648
ASIM : 4.49650756146648
ASP : 4.091042453358316
ATM : 3.803360380906535
AWARDS : 1.297834443915799
AWS : 4.49650756146648
AZIZ : 4.49650756146648
Aarfeen : 4.49650756146648
Abdul : 4.49650756146648
Abdullah : 4.49650756146648
Abid : 4.49650756146648
Abidin : 4.49650756146648
Academy : 4.091042453358316
Achievement : 4.49650756146648
Address : 1.0
Administration : 4.091042453358316
Admission : 3.580216829592325
Adobe : 1.238411023444998
Ads : 4.49650756146648
Adventure : 4.49650756146648
Ahmad : 4.49650756146648
Ahmed : 4.091042453358316
Ahsan : 4.49650756146648
Aima : 4.49650756146648
Aimen : 4.

Read dataset and clean + preprocess

In [8]:
import pandas as pd

#read the data into a pandas dataframe
df = pd.read_excel("all_company_data.xlsx")

df = df.fillna('')
df['skills'] = df['company_domain'] + ' ' + df['company_expertise']
df['skills'] = df['skills'].apply(remove_punctuation)
df = df.drop(labels=['web-scraper-order', 'web-scraper-start-url', 'pages', 'company_domain', 'company_expertise'], axis=1)

indices = list(range(df.shape[0]))
str_indices = [str(i) for i in indices]
df = df.set_index([pd.Index(str_indices)])

df.head(5)

Unnamed: 0,company_name,company_location,skills
0,MTechSoft,Faisalabad,eCommerce
1,MTechSoft,Faisalabad,Mobile Development
2,MTechSoft,Faisalabad,UI UX Design Creative
3,MTechSoft,Faisalabad,Web Development
4,MTechSoft,Faisalabad,Angular Js


Give each company an id (ML works with numbers only)

In [9]:
#Add the new column which gives a unique number to each of these labels 
companies = df['company_name'].unique()
companies = {companies[i]:i for i in range(len(companies))}

df['company_num'] = df['company_name'].map(companies)

#checking the results 
df.head(5)

Unnamed: 0,company_name,company_location,skills,company_num
0,MTechSoft,Faisalabad,eCommerce,0
1,MTechSoft,Faisalabad,Mobile Development,0
2,MTechSoft,Faisalabad,UI UX Design Creative,0
3,MTechSoft,Faisalabad,Web Development,0
4,MTechSoft,Faisalabad,Angular Js,0


Train-test split

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['skills'], 
    df.company_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2023
)

print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X_train:  (1208,)
Shape of y_train:  (1208,)
Shape of X_test:  (302,)
Shape of y_test:  (302,)


Train KNN model

In [11]:
X_train.head()

334             Mobile Development
276              Software Products
862             Mobile Development
1506    In House Ui Ux Design Team
1273             Software Services
Name: skills, dtype: object

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_tfidf', v),    
    ('KNN', KNeighborsClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train.T, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test.T)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.02      0.33      0.04         3
           2       1.00      1.00      1.00         1
           3       0.01      0.33      0.02         3
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         2
          23       0.06      1.00      0.11         1
          24       0.00      0.00      0.00         0
          26       0.00      0.00      0.00         1
          28       0.00      0.00      0.00         2
          29       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
import random

random_file = random.choice(paths)

text = textract.process(path)
text = text.decode(encoding)
# text preprocessing
text = remove_punctuation(text)
text = remove_newlines(text)
text = remove_numbers(text)
print(f'Document text becomes in this format: (Order of sentence does not matter)\n{text}')

pred = clf.predict([text])[0]

df[df['company_num'] == pred]

Document text becomes in this format: (Order of sentence does not matter)
BSSE  University of Sargodha Department of Computer Science Information Technology Waseem Ullah EDUCATION QUALIFICATION University of Sargodha BSSE Honors CGPA     Punjab GROUP OF COLLEGES SGD ICs Subjects Mathematics Physics Computer Dar e Arqam SCHOOL SGD Matriculation Computer Science PROJECTS IoT Projecy Solar Tracking Sytem Web Engineering Blood Donation Database Systems Hostel Management System ACTIVITIES HONORS AWARDS Member Event Management Society University of Sargodha Cricket Match Sports Gala Trophy SKILLS TOOLS Tools Figma Adobe XD Photoshop Adobe Premier Pro   Skills UI UX Designing Logo Creation Web Design Languages C HTML CSS INTERESTS Reading playing cricket volunteering in university events Date of Birth    Business Address CS IT Department University of Sargodha University Road Sargodha Pakistan Tel  Cell   Email waseemullahg g mail com


Unnamed: 0,company_name,company_location,skills,company_num
20,TEO (Pvt.) Ltd.,Islamabad,Artifical Intelligence Machine Learning,3
21,TEO (Pvt.) Ltd.,Islamabad,Consulting,3
22,TEO (Pvt.) Ltd.,Islamabad,eCommerce,3
23,TEO (Pvt.) Ltd.,Islamabad,Internet of Things IoT,3
24,TEO (Pvt.) Ltd.,Islamabad,Mobile Development,3
25,TEO (Pvt.) Ltd.,Islamabad,Sales Marketing,3
26,TEO (Pvt.) Ltd.,Islamabad,Software Products,3
27,TEO (Pvt.) Ltd.,Islamabad,Software Services,3
28,TEO (Pvt.) Ltd.,Islamabad,UI UX Design Creative,3
29,TEO (Pvt.) Ltd.,Islamabad,Web Development,3
