# Resume named entities CSV
https://www.kaggle.com/kerneler/starter-resume-named-entities-csv-cbe274fc-7

In [7]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [8]:
# Print the full path of every file found anywhere below the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    paths = (os.path.join(folder, name) for name in names)
    for path in paths:
        print(path)

In [10]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a histogram or bar chart for each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are
    considered. Numeric columns are drawn as histograms; every other column
    is drawn as a bar chart of its value counts.

    Args:
        df: DataFrame whose columns are plotted.
        nGraphShown: Maximum number of columns to plot.
        nGraphPerRow: Number of subplots placed on each row of the grid.

    Returns:
        None. The figure is displayed with ``plt.show()``; when there is
        nothing to plot a message is printed and no figure is created.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)
    nGraphs = min(len(columnNames), nGraphShown)
    if nGraphs <= 0:
        # A zero-row grid would give the figure a height of 0, which is invalid
        print('No distribution plots shown: there are no columns to plot')
        return
    # Ceiling division with //: plt.subplot needs a whole number of rows, and
    # sizing the grid from the plotted count avoids empty trailing rows
    nGraphRow = (nGraphs + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num=None, figsize=(6 * nGraphPerRow, 8 * nGraphRow), dpi=80, facecolor='w', edgecolor='k')
    for i in range(nGraphs):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # Decide from the column dtype rather than the first cell, so a
        # zero-row frame or a leading NaN cannot break or mislead the check.
        # Booleans are kept on the bar-chart path.
        isNumeric = (pd.api.types.is_numeric_dtype(columnDf)
                     and not pd.api.types.is_bool_dtype(columnDf))
        if isNumeric:
            columnDf.hist()
        else:
            columnDf.value_counts().plot.bar()
        plt.ylabel('counts')
        plt.xticks(rotation=90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
    plt.show()

In [11]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Display the correlation matrix of the numeric columns of ``df``.

    Non-numeric columns, columns containing NaN and constant columns are
    discarded first. If fewer than two columns remain, a message is printed
    and nothing is plotted.

    Args:
        df: DataFrame to analyse. If it has a ``dataframeName`` attribute,
            that value is used in the plot title.
        graphWidth: Width and height of the square figure, passed to
            ``figsize``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # ``dataframeName`` is a custom attribute that plain DataFrames do not have;
    # read it before filtering and fall back to a generic label when missing
    filename = getattr(df, 'dataframeName', 'DataFrame')
    # Keep only columns corr() can handle; recent pandas raises on text columns
    df = df.select_dtypes(include=['number', 'bool'])
    df = df.dropna(axis='columns')  # drop columns with NaN (axis must be a keyword in pandas 2.x)
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum=1)  # draw into the figure created above
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [12]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Display a scatter matrix with KDE diagonals for the numeric columns of ``df``.

    Columns containing NaN and constant columns are discarded, and at most
    the first 10 remaining columns are plotted. Each panel above the diagonal
    is annotated with the correlation coefficient of its column pair.

    Args:
        df: DataFrame to analyse.
        plotSize: Width and height of the square figure, passed to ``figsize``.
        textSize: Font size of the correlation annotations.

    Returns:
        None. The figure is displayed with ``plt.show()``; when no usable
        column remains a message is printed and no figure is created.
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')  # axis must be passed by keyword in pandas 2.x
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    # reduce the number of columns for matrix inversion of kernel density plots
    df = df[list(df)[:10]]
    if df.shape[1] == 0:
        print('No scatter plots shown: there are no non-constant numerical columns without NaN')
        return
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use the module-level numpy import; ``plt.np`` is not part of pyplot's public API
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate(f'Corr. coef = {corrs[i, j]:.3f}', (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [32]:
# Load the tab-separated entity file; header=None because the file has no
# header row, so the columns get integer labels
tsv_path = './data/Entity.tsv'
df = pd.read_csv(tsv_path, sep='\t', header=None)
rows_and_columns = df.shape
print(rows_and_columns)
df.head()

(5329923, 2)


Unnamed: 0,0,1
0,Abhishek,Name
1,Jha,Name
2,Application,Designation
3,Development,Designation
4,Associate,Designation


In [33]:
# Replace the default integer column labels (0 and 1) with descriptive names
df.columns = ['cv_info', 'category']
# Preview the first rows to confirm the new labels
df.head()

Unnamed: 0,cv_info,category
0,Abhishek,Name
1,Jha,Name
2,Application,Designation
3,Development,Designation
4,Associate,Designation


In [34]:
# Number of rows carrying each entity label, most frequent first
df['category'].value_counts()

O                           4657718
Skills                       248920
Companies worked at          105858
Designation                   96406
Degree                        49535
College Name                  48086
Years of Experience           33667
Location                      29169
Name                          21719
Email Address                 17390
Graduation Year                9186
College                        2401
UNKNOWN                        2374
Rewards and Achievements       2262
Can Relocate to                1605
Address                         997
Links                           988
projects                        544
University                      403
Relocate to                     271
Certifications                  270
training                         88
state                            35
links                            26
des                               2
work experience                   2
abc                               1
Name: category, dtype: int64

In [35]:
# Show only the rows labelled 'Years of Experience'
df.loc[df['category'] == 'Years of Experience']

Unnamed: 0,cv_info,category
416,3.2,Years of Experience
417,years,Years of Experience
418,of,Years of Experience
419,experience,Years of Experience
506,January,Years of Experience
...,...,...
5307939,Present,Years of Experience
5307986,15+,Years of Experience
5307987,Yrs.,Years of Experience
5329048,7+,Years of Experience


In [44]:
# Frequency of each token among the rows labelled 'Companies worked at'
df.loc[df['category'] == 'Companies worked at', 'cv_info'].value_counts()

Ltd          8130
Microsoft    3763
Limited      3040
Oracle       2918
India        2864
             ... 
THOUSNAD        1
Emaar           1
PAVIZHAM        1
MANTRA          1
te              1
Name: cv_info, Length: 1832, dtype: int64