In [1]:
import numpy as np
import pandas as pd
#import tensorflow as tf
#from tensorflow import keras

## Data Import And Exploration

In [2]:
#Importing the data
df = pd.read_excel('Data Set - industrial_safety_and_health_database_with_accidents_description.xlsx')

In [3]:
#Checking the shape
df.shape

(425, 11)

Dataset has 425 records and 11 columns.

In [4]:
#Checking the info about columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0                425 non-null    int64         
 1   Data                      425 non-null    datetime64[ns]
 2   Countries                 425 non-null    object        
 3   Local                     425 non-null    object        
 4   Industry Sector           425 non-null    object        
 5   Accident Level            425 non-null    object        
 6   Potential Accident Level  425 non-null    object        
 7   Genre                     425 non-null    object        
 8   Employee or Third Party   425 non-null    object        
 9   Critical Risk             425 non-null    object        
 10  Description               425 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 36.6+ KB


* There is one unnamed column, so we can remove that column.
* All the columns are categorical except Data and Unnamed column.

In [5]:
#Checking the first 5 records
df.head()

Unnamed: 0.1,Unnamed: 0,Data,Countries,Local,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee or Third Party,Critical Risk,Description
0,0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...
1,1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...
2,2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...
3,3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...
4,4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...


# Data Cleaning

In [6]:
#Dropping the Unnamed columns
df = df.drop('Unnamed: 0', axis=1)

In [7]:
#Checking the duplicate values
df.duplicated().sum()

7

There are 7 duplicate records.

In [8]:
#Checking the uplicate records
df[df.duplicated() == True]

Unnamed: 0,Data,Countries,Local,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee or Third Party,Critical Risk,Description
77,2016-04-01,Country_01,Local_01,Mining,I,V,Male,Third Party (Remote),Others,In circumstances that two workers of the Abrat...
262,2016-12-01,Country_01,Local_03,Mining,I,IV,Male,Employee,Others,During the activity of chuteo of ore in hopper...
303,2017-01-21,Country_02,Local_02,Mining,I,I,Male,Third Party (Remote),Others,Employees engaged in the removal of material f...
345,2017-03-02,Country_03,Local_10,Others,I,I,Male,Third Party,Venomous Animals,On 02/03/17 during the soil sampling in the re...
346,2017-03-02,Country_03,Local_10,Others,I,I,Male,Third Party,Venomous Animals,On 02/03/17 during the soil sampling in the re...
355,2017-03-15,Country_03,Local_10,Others,I,I,Male,Third Party,Venomous Animals,Team of the VMS Project performed soil collect...
397,2017-05-23,Country_01,Local_04,Mining,I,IV,Male,Third Party,Projection of fragments,In moments when the 02 collaborators carried o...


In [9]:
#Removing the duplicate records
df.drop_duplicates(inplace=True)

In [10]:
#Renaming the column names
df.rename(columns={'Data' : 'Date', 'Countries':'Country','Genre':'Gender','Employee or Third Party':'Employee Type'},inplace= True)

In [11]:
#Checking for the missing values
df.isnull().sum()

Date                        0
Country                     0
Local                       0
Industry Sector             0
Accident Level              0
Potential Accident Level    0
Gender                      0
Employee Type               0
Critical Risk               0
Description                 0
dtype: int64

There are no missing values in the dataset.

In [12]:
#Deriving new columns from Date column - Year, Month, day, Dayname
df['Year'] = df['Date'].apply(lambda x:x.year)
df['Day'] = df['Date'].apply(lambda x:x.day)
df['Month'] = df['Date'].apply(lambda x:x.month)
df['Day_name'] = df['Date'].apply(lambda x:x.day_name())

In [13]:
df.head()

Unnamed: 0,Date,Country,Local,Industry Sector,Accident Level,Potential Accident Level,Gender,Employee Type,Critical Risk,Description,Year,Day,Month,Day_name
0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...,2016,1,1,Friday
1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...,2016,2,1,Saturday
2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...,2016,6,1,Wednesday
3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...,2016,8,1,Friday
4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...,2016,10,1,Sunday


In [14]:
#Replacing Accident Level VI with V
df['Accident Level'] = df['Accident Level'].replace('VI','V')
df['Potential Accident Level'] = df['Potential Accident Level'].replace('VI','V')

# NLP Data Preprocessing

In [15]:
#Feature and Target variable
x = df['Description']
y = df['Accident Level']

In [16]:
import unidecode
def rmv_uni(sen):
  wrds = sen.split()
  new_sen = [unidecode.unidecode(w) for w in wrds]
  new_sen = ' '.join(new_sen)

  return new_sen

In [17]:
x = x.apply(lambda x: rmv_uni(x))

In [18]:
def rmv_schar(s):
  wrds = s.split()
  new_text = [w for w in wrds if w.isalnum()]
  new_text = ' '.join(new_text)

  return new_text

In [19]:
x = x.apply(lambda x: rmv_schar(x))

In [20]:
def lower_case(s):
  s = s.lower()
  return s

In [21]:
x = x.apply(lambda x: lower_case(x))

In [22]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Mrinal
[nltk_data]     Kalita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
def rmv_stpwrds(sen):
  wrds = sen.split()
  new_text = [w for w in wrds if w not in stopwords.words('english')]
  new_text = ' '.join(new_text)

  return new_text

In [24]:
x= x.apply(lambda x: rmv_stpwrds(x))

In [25]:
def strp(sen):
  sen = sen.strip()
  return sen

In [26]:
x = x.apply(lambda x: strp(x))

In [27]:
#Adding preprocessed description in the dataframe
df['Cleaned_description'] = x

In [28]:
df = df[['Cleaned_description','Accident Level']]

In [29]:
df.head()

Unnamed: 0,Cleaned_description,Accident Level
0,removing drill rod jumbo 08 supervisor proceed...,I
1,activation sodium sulphide piping uncoupled su...,I
2,milpo located level collaborator excavation wo...,I
3,approximately 1880 personnel begins task unloc...,I
4,approximately circumstances mechanics anthony ...,IV


In [30]:
df['Accident Level'] = df['Accident Level'].replace(['I','II','III','IV','V'],[1,2,3,4,5]) 

In [40]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

d = {'train':Dataset.from_dict({'label':df['Accident Level'][0:339],'text':df['Cleaned_description'][0:339]}),
     'val':Dataset.from_dict({'label':df['Accident Level'][340:],'text':df['Cleaned_description'][340:]}),
     }

d = DatasetDict(d)

In [41]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
#def tokenize_function(examples):
 #  return tokenizer(examples, padding="max_length", truncation=True)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = d.map(tokenize_function, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████| 339/339 [00:00<00:00, 368.54 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 78/78 [00:00<00:00, 275.77 examples/s]


In [42]:
small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["val"]

In [43]:
import torch
#print(torch.cuda.is_available())
#print(torch.cuda.get_device_name(0))

In [60]:
from transformers import GPT2ForSequenceClassification
#from tensorflow import keras

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=6)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return metric.compute(predictions=predictions, references=labels)

In [61]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
   output_dir="test_trainer",
   #evaluation_strategy="epoch",
   per_device_train_batch_size=1,  # Reduce batch size here
   per_device_eval_batch_size=1,    # Optionally, reduce for evaluation as well
   gradient_accumulation_steps=4
   )


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
   compute_metrics=compute_metrics,

)

trainer.train()

Step,Training Loss


TrainOutput(global_step=252, training_loss=0.9317533704969618, metrics={'train_runtime': 11697.8913, 'train_samples_per_second': 0.087, 'train_steps_per_second': 0.022, 'total_flos': 526793274556416.0, 'train_loss': 0.9317533704969618, 'epoch': 2.97})

In [62]:
import evaluate

trainer.evaluate()

{'eval_loss': 0.9067965745925903,
 'eval_accuracy': 0.7435897435897436,
 'eval_runtime': 355.3761,
 'eval_samples_per_second': 0.219,
 'eval_steps_per_second': 0.219,
 'epoch': 2.97}

In [63]:
import pickle
pickle.dump(model,open('model.pkl','wb'))

In [4]:
import pickle
mod = pickle.load(open('C:/Users/Mrinal Kalita/Python Projects/CApstone Project - Chatbot/model.pkl', 'rb'))

  from .autonotebook import tqdm as notebook_tqdm
