# Analyze employee feedback surveys to identify areas of improvement

***Importing nesseccesary libraries***

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns


***Reading CSV dataset file***

In [16]:
df  = pd.read_csv(r"C:\Users\ktand\Downloads\archive\employee_review_mturk_dataset_v10_kaggle.csv")
df

Unnamed: 0,id,person_name,nine_box_category,feedback,adjusted,reviewed
0,1,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has not progressed in his position. He is...,False,True
1,2,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has consistently disappointed me this qua...,False,True
2,3,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John turned in subpar work product all quarter...,False,True
3,6,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John Doe demonstrates a low level of knowledge...,False,True
4,7,George Gill,"Category 1: 'Risk' (Low performance, Low poten...",George gill's performance is really poor. He d...,False,True
...,...,...,...,...,...,...
873,10205,Bailey Hunt,"Category 9: 'Star' (High performance, High pot...",No one performs like Bailey. I believe she wil...,False,False
874,10226,Thaddeus Burgess,"Category 9: 'Star' (High performance, High pot...",Thaddeus Burgess is a constant force within th...,True,True
875,20022,Max Miller,"Category 9: 'Star' (High performance, High pot...",Max Miller is a a great coworker. He is dili...,True,True
876,20023,Allan Logan,"Category 9: 'Star' (High performance, High pot...","Allan Logan, Excellent performer absolutely bl...",True,True


***Import Warning to avoid unneccessary warnings***

In [17]:
import warnings
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")
warnings.filterwarnings("ignore", message="When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group.*")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
df.head()  #getting overviw of first 5 lines of the dataset using head() function

Unnamed: 0,id,person_name,nine_box_category,feedback,adjusted,reviewed
0,1,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has not progressed in his position. He is...,False,True
1,2,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has consistently disappointed me this qua...,False,True
2,3,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John turned in subpar work product all quarter...,False,True
3,6,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John Doe demonstrates a low level of knowledge...,False,True
4,7,George Gill,"Category 1: 'Risk' (Low performance, Low poten...",George gill's performance is really poor. He d...,False,True


***Basic statistics***

In [6]:
df.describe() #to get the descrivtive stats of the dataframe


Unnamed: 0,id
count,878.0
mean,3235.269932
std,4956.401144
min,1.0
25%,237.25
50%,473.5
75%,10019.75
max,20222.0


***Exploring The Data***

In [7]:
print("Data Shape:", df.shape,'\n')
print("Data Columns:", df.columns,'\n')
print("Data Types:\n", df.dtypes,'\n')
print("Missing Values:\n", df.isnull().sum(),'\n')

Data Shape: (878, 6) 

Data Columns: Index(['id', 'person_name', 'nine_box_category', 'feedback', 'adjusted',
       'reviewed'],
      dtype='object') 

Data Types:
 id                    int64
person_name          object
nine_box_category    object
feedback             object
adjusted               bool
reviewed               bool
dtype: object 

Missing Values:
 id                   0
person_name          0
nine_box_category    0
feedback             0
adjusted             0
reviewed             0
dtype: int64 



***Data Cleaning***

***Replace null values***

In [8]:
df = df.replace("s","NA")
df

Unnamed: 0,id,person_name,nine_box_category,feedback,adjusted,reviewed
0,1,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has not progressed in his position. He is...,False,True
1,2,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has consistently disappointed me this qua...,False,True
2,3,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John turned in subpar work product all quarter...,False,True
3,6,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John Doe demonstrates a low level of knowledge...,False,True
4,7,George Gill,"Category 1: 'Risk' (Low performance, Low poten...",George gill's performance is really poor. He d...,False,True
...,...,...,...,...,...,...
873,10205,Bailey Hunt,"Category 9: 'Star' (High performance, High pot...",No one performs like Bailey. I believe she wil...,False,False
874,10226,Thaddeus Burgess,"Category 9: 'Star' (High performance, High pot...",Thaddeus Burgess is a constant force within th...,True,True
875,20022,Max Miller,"Category 9: 'Star' (High performance, High pot...",Max Miller is a a great coworker. He is dili...,True,True
876,20023,Allan Logan,"Category 9: 'Star' (High performance, High pot...","Allan Logan, Excellent performer absolutely bl...",True,True


***Removing missing values***

In [9]:
df = df.dropna()

In [10]:
df = df.head()
df

Unnamed: 0,id,person_name,nine_box_category,feedback,adjusted,reviewed
0,1,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has not progressed in his position. He is...,False,True
1,2,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John has consistently disappointed me this qua...,False,True
2,3,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John turned in subpar work product all quarter...,False,True
3,6,John Doe,"Category 1: 'Risk' (Low performance, Low poten...",John Doe demonstrates a low level of knowledge...,False,True
4,7,George Gill,"Category 1: 'Risk' (Low performance, Low poten...",George gill's performance is really poor. He d...,False,True


# EDA : Exploratory Data Analysis

***Removing duplicates***

In [11]:
df = df.duplicated()
df

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df.info()

<class 'pandas.core.series.Series'>
Index: 1 entries, 0 to 0
Series name: None
Non-Null Count  Dtype
--------------  -----
1 non-null      bool 
dtypes: bool(1)
memory usage: 9.0 bytes


***Checking Missing Values***

In [14]:
df = df.copy()

df = df.isnull().sum()
df

0

***Data Visualisation***

In [18]:
continuous_features = ['QTY', 'VALUE', 'PRICE']
for feature in continuous_features:
    sns.kdeplot(df[feature], shade=True, label=feature)
plt.title('KDE Plot of Continuous Features')
plt.xlabel('Value')
plt.ylabel('Density')
plt.xlim(0, 500)
plt.legend()
plt.show()

NameError: name 'feedback' is not defined

In [None]:
from collections import Counter

word_tokens = df['review_text'].apply(word_tokenize)
word_freq = Counter(word for tokens in word_tokens for word in tokens)

plt.barh(range(10), [freq for word, freq in word_freq.most_common(10)])
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.title('Top 10 Most Common Words')
plt.show()

In [None]:
review_lengths = df['review_text'].apply(len)
plt.hist(review_lengths, bins=50)
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Distribution of Review Lengths')
plt.show()