# **VERİ ANALİZİ**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)

In [2]:
import pandas as pd

# Specify the full path to the file
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where that exact file exists.
df = pd.read_csv("C:\\Users\\Win10\\Desktop\\HR-EmployeeAttrition.csv")

# Display the first few rows
print(df.head())


   Age     BusinessTravel              Department  DistanceFromHome  Education EducationField  EnvironmentSatisfaction  Gender  JobInvolvement  JobLevel                JobRole  JobSatisfaction MaritalStatus  MonthlyIncome OverTime  PerformanceRating  RelationshipSatisfaction  StockOptionLevel  TotalWorkingYears  TrainingTimesLastYear  WorkLifeBalance  YearsAtCompany  YearsInCurrentRole  YearsSinceLastPromotion  YearsWithCurrManager Attrition
0   30      Travel_Rarely  Research & Development                 1          2        Medical                        4    Male               3         1  Laboratory Technician                2       Married           3748       No                  3                         3                 0                 12                      6                2              12                   8                        1                     7        No
1   48  Travel_Frequently  Research & Development                 4          5        Medical               

In [3]:
# Exploratory Data Analysis (EDA)
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    The report is written to stdout as a sequence of banner/value pairs, in
    this order: shape, column dtypes, first ``head`` rows, last ``head`` rows,
    per-column null counts and the transposed ``describe()`` table.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of rows shown in the "Head" and "Tail" sections.

    Returns:
        None. Everything is printed.
    """
    # Each section is computed lazily, right before it is printed, so the
    # sections appear (and can fail) in exactly this order.
    sections = (
        ("Shape", lambda: dataframe.shape),
        ("Types", lambda: dataframe.dtypes),
        ("Head", lambda: dataframe.head(head)),
        ("Tail", lambda: dataframe.tail(head)),
        ("NA", lambda: dataframe.isnull().sum()),
        ("Describe", lambda: dataframe.describe().T),
    )

    for title, compute in sections:
        print(f"##################### {title} #####################")
        print(compute())

# Identify the categorical and numerical variables

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and
    high-cardinality ("cardinal") categorical groups.

    A column is treated as label-like when its dtype is object, a string
    dtype, or ``category``. Checking only ``dtype == "O"`` would send
    ``category`` / dedicated string columns into the numeric branch.

    Args:
        dataframe: The pandas DataFrame whose columns are classified.
        cat_th: A non-label column with fewer than ``cat_th`` distinct values
            is treated as categorical ("numeric but categorical").
        car_th: A label-like column with more than ``car_th`` distinct values
            is treated as cardinal and excluded from ``cat_cols``.

    Returns:
        tuple: ``(cat_cols, num_cols, cat_but_car)`` as lists of column names.
        ``cat_cols`` holds the label-like columns followed by the
        numeric-but-categorical ones, minus the cardinal ones. The
        numeric-but-categorical list is only reported in the printed summary.
    """
    dtype_tools = pd.api.types

    def _is_label_like(series):
        # object, string and category dtypes all carry labels, not magnitudes.
        return (dtype_tools.is_object_dtype(series)
                or dtype_tools.is_string_dtype(series)
                or isinstance(series.dtype, pd.CategoricalDtype))

    columns = list(dataframe.columns)
    # Compute each column's properties once instead of once per comprehension.
    label_like = {col: _is_label_like(dataframe[col]) for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    cat_cols = [col for col in columns if label_like[col]]

    num_but_cat = [col for col in columns
                   if n_unique[col] < cat_th and not label_like[col]]

    cat_but_car = [col for col in columns
                   if n_unique[col] > car_th and label_like[col]]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in columns
                if not label_like[col] and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

# Categorical variable analysis

def cat_summary_plotly(dataframe, col_name):
    """Show a bar chart of the value counts of ``col_name``.

    Each bar is annotated with that value's share of all rows of
    ``dataframe`` as a percentage rounded to two decimals.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Name of the column to summarise.
    """
    counts = dataframe[col_name].value_counts().reset_index()
    counts.columns = ['value', 'count']

    # The denominator is the full row count of the frame.
    share = 100 * counts['count'] / len(dataframe)
    counts['percentage'] = share.round(2)  # 2 decimal places

    axis_labels = {'value': col_name, 'count': 'Count', 'percentage': 'Percentage'}
    chart = px.bar(counts, x='value', y='count', text='percentage',
                   title=f'{col_name} Count and Percentage',
                   labels=axis_labels)
    chart.show()

# Numerical variable analysis

def num_summary_plotly(dataframe, numerical_col):
    """Show a 30-bin histogram and then a box plot of ``numerical_col``.

    Args:
        dataframe: The pandas DataFrame holding the column.
        numerical_col: Name of the column to plot.
    """
    histogram = px.histogram(
        dataframe,
        x=numerical_col,
        nbins=30,
        title=f'{numerical_col} Distribution',
    )
    histogram.show()

    box_plot = px.box(
        dataframe,
        y=numerical_col,
        title=f'{numerical_col} Box Plot',
    )
    box_plot.show()

# Target variable analysis (categorical)

def target_summary_with_cat_plotly(dataframe, target, categorical_col):
    """Show a bar chart of the mean of ``target`` per ``categorical_col`` level.

    Args:
        dataframe: The pandas DataFrame holding both columns.
        target: Name of the column that is averaged.
        categorical_col: Name of the column used for grouping.
    """
    grouped_target = dataframe.groupby(categorical_col)[target]
    level_means = grouped_target.mean().reset_index()

    axis_labels = {categorical_col: categorical_col, target: target}
    chart = px.bar(level_means, x=categorical_col, y=target,
                   title=f'{categorical_col} vs. {target}',
                   labels=axis_labels)
    chart.show()

# Target variable analysis (numerical)

def target_summary_with_num_plotly(dataframe, target, numerical_col):
    """Show a bar chart of the mean of ``numerical_col`` per ``target`` value.

    Args:
        dataframe: The pandas DataFrame holding both columns.
        target: Name of the column used for grouping.
        numerical_col: Name of the column that is averaged within each group.
    """
    grouped_values = dataframe.groupby(target)[numerical_col]
    group_means = grouped_values.mean().reset_index()

    axis_labels = {target: target, numerical_col: 'Mean of ' + numerical_col}
    chart = px.bar(group_means, x=target, y=numerical_col,
                   title=f'{target} vs. Mean of {numerical_col}',
                   labels=axis_labels)
    chart.show()

def correlation_heatmap(dataframe):
    """Show a heatmap of the pairwise correlations of the numeric columns.

    Non-numeric columns are dropped before the correlation is computed, so a
    frame that still contains object/string columns can be passed directly
    (a bare ``DataFrame.corr()`` raises on such columns in pandas >= 2.0).

    Args:
        dataframe: The pandas DataFrame to analyse.
    """
    numeric_part = dataframe.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric_part.corr()

    # In a Heatmap, z[i][j] is drawn at (x[j], y[i]): rows map to y, columns to x.
    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns.values,
        y=corr_matrix.index.values,
        colorscale='Viridis',
        colorbar=dict(title='Correlation'),
    ))

    fig.update_layout(
        title='Correlation Heatmap',
        xaxis=dict(title='Features'),
        yaxis=dict(title='Features')
    )

    fig.show()

# Inspect missing or undisclosed data
def check_missing_data_plotly(dataframe):
    """Report the columns of ``dataframe`` that contain null values.

    If no column has nulls, a single message is printed. Otherwise the
    per-column null counts (sorted descending) are printed and shown as a
    bar chart.

    Args:
        dataframe: The pandas DataFrame to inspect.
    """
    null_counts = dataframe.isnull().sum().sort_values(ascending=False)
    with_gaps = null_counts[null_counts > 0]

    # Nothing missing: report and stop before any plotting.
    if with_gaps.empty:
        print("Veri setinde eksik veya açıklanmayan veri yok.")
        return

    print("Eksik veya açıklanmayan verilerin sayısı ve oranları:\n", with_gaps)
    # Visualise the missing-value counts per column as well
    axis_labels = {'x': 'Değişkenler', 'y': 'Eksik Veri Sayısı'}
    chart = px.bar(x=with_gaps.index, y=with_gaps.values, labels=axis_labels,
                   title='Eksik Veri Sayısı')
    chart.show()
        
# Outlier analysis: detection with the IQR method
def outlier_analysis_iqr_plotly(dataframe, numerical_cols):
    """Find IQR outliers per column and show a box plot with the fences.

    For every column in ``numerical_cols`` the fences are
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; rows strictly outside them are
    collected as outliers. A box plot is shown with both fences drawn as
    horizontal red lines.

    Args:
        dataframe: The pandas DataFrame holding the columns.
        numerical_cols: Iterable of column names to analyse.

    Returns:
        dict: Maps each column name to the sub-DataFrame of ``dataframe``
        containing that column's outlier rows (all columns are kept).
    """
    outlier_dict = {}

    for col in numerical_cols:
        q1 = dataframe[col].quantile(0.25)
        q3 = dataframe[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (dataframe[col] < lower_bound) | (dataframe[col] > upper_bound)
        outlier_dict[col] = dataframe[is_outlier]

        # Build the box plot
        fig = px.box(dataframe, y=col, title=f'{col} Aykırı Değer Analizi (IQR Method)')
        # The values live on the y axis, so each fence is a horizontal line at
        # y=fence. xref="paper" with x from 0 to 1 spans the full plot width.
        for fence in (lower_bound, upper_bound):
            fig.add_shape(type="line", xref="paper", x0=0, x1=1,
                          y0=fence, y1=fence,
                          line=dict(color="red", width=2))
        fig.show()

    return outlier_dict

In [4]:
# Print the structural overview (shape, dtypes, head/tail, null counts, describe) of the loaded frame.
check_df(df)

##################### Shape #####################
(1323, 26)
##################### Types #####################
Age                          int64
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EnvironmentSatisfaction      int64
Gender                      object
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
OverTime                    object
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
Attrition     

# **Kategorik ve sayısal değişkenlerin belirlenmesi**

In [5]:
# Classify the columns with the default thresholds, then list the members of each group.
cat_cols, num_cols, cat_but_car = grab_col_names(df)
print(f"Kategorik Değişkenler: {cat_cols}")
print(f"Sayısal Değişkenler: {num_cols}")
print(f"Kategorik Ancak Kardinal Değişkenler: {cat_but_car}")


Observations: 1323
Variables: 26
cat_cols: 18
num_cols: 8
cat_but_car: 0
num_but_cat: 10
Kategorik Değişkenler: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime', 'Attrition', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance']
Sayısal Değişkenler: ['Age', 'DistanceFromHome', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Kategorik Ancak Kardinal Değişkenler: []


In [6]:
# Rebuild the column groups from scratch so this cell can be re-run safely:
# list.remove raises ValueError if the name has already been removed.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

# Both columns are used as targets further down, so they are dropped from the
# list of categorical features that gets iterated over.
cat_cols.remove("PerformanceRating")
cat_cols.remove("Attrition")

Observations: 1323
Variables: 26
cat_cols: 18
num_cols: 8
cat_but_car: 0
num_but_cat: 10


In [7]:
# Show the current contents of the three column groups.
print(f"Kategorik Değişkenler: {cat_cols}")
print(f"Sayısal Değişkenler: {num_cols}")
print(f"Kategorik Ancak Kardinal Değişkenler: {cat_but_car}")




Kategorik Değişkenler: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance']
Sayısal Değişkenler: ['Age', 'DistanceFromHome', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Kategorik Ancak Kardinal Değişkenler: []


# **Kategorik Değişken Analizi**


In [8]:
# One count/percentage bar chart per categorical feature.
for col in cat_cols:
    cat_summary_plotly(df, col)

# **Sayısal Değişken Analizi**


In [9]:
# One histogram and one box plot per numerical feature.
for col in num_cols:
    num_summary_plotly(df, col)

# **1.Hedef Değişken Analizi (Kategorik) (Attrition)**

In [10]:
# Encode the target as 0/1 ("Yes" -> 1, anything else -> 0) so that the group
# means plotted below read as attrition rates.
# The dtype guard keeps the cell idempotent: without it, re-running the cell
# would compare the already-encoded integers with "Yes" and reset every row to 0.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = (df['Attrition'] == "Yes").astype("int64")
for col in cat_cols:
    target_summary_with_cat_plotly(df, "Attrition", col)
    


# **1. Hedef Değişken Analizi (Sayısal) (Attrition)**


In [11]:
# Mean of each numerical feature within each Attrition group.
for col in num_cols:
    target_summary_with_num_plotly(df, "Attrition", col)

# **2. Hedef Değişken Analizi (Kategorik) (PerformanceRating)**

In [14]:
# Mean PerformanceRating per level of each categorical feature.
# The column is used as-is; no recoding step is applied here.
for col in cat_cols:
    target_summary_with_cat_plotly(df, "PerformanceRating", col)

# **2. Hedef Değişken Analizi (Sayısal) (PerformanceRating)**

In [15]:
# Mean of each numerical feature within each PerformanceRating group.
for col in num_cols:
    target_summary_with_num_plotly(df, "PerformanceRating", col)

# **Korelasyon Analizi**

In [16]:
# Restrict the frame to the columns listed in num_cols before computing correlations.
numerical_df = df[num_cols]
correlation_heatmap(numerical_df)

# **Eksik Verilerin İncelenmesi**

In [17]:
# Report (and, if any exist, plot) the per-column null counts.
check_missing_data_plotly(df)

Veri setinde eksik veya açıklanmayan veri yok.


# **Aykırı Değer Analizi**

In [18]:
# Find the outliers of every numerical column, collect them and show the plots
outliers_dict = outlier_analysis_iqr_plotly(df, num_cols)

In [19]:
# For each numerical column, print the outlier count and the first rows of its outlier DataFrame
for col, outliers_df in outliers_dict.items():
    print(f"{col} Aykırı Değerler: {len(outliers_df)}")
    print(outliers_df.head())
    print("\n")

Age Aykırı Değerler: 0
Empty DataFrame
Columns: [Age, BusinessTravel, Department, DistanceFromHome, Education, EducationField, EnvironmentSatisfaction, Gender, JobInvolvement, JobLevel, JobRole, JobSatisfaction, MaritalStatus, MonthlyIncome, OverTime, PerformanceRating, RelationshipSatisfaction, StockOptionLevel, TotalWorkingYears, TrainingTimesLastYear, WorkLifeBalance, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager, Attrition]
Index: []


DistanceFromHome Aykırı Değerler: 0
Empty DataFrame
Columns: [Age, BusinessTravel, Department, DistanceFromHome, Education, EducationField, EnvironmentSatisfaction, Gender, JobInvolvement, JobLevel, JobRole, JobSatisfaction, MaritalStatus, MonthlyIncome, OverTime, PerformanceRating, RelationshipSatisfaction, StockOptionLevel, TotalWorkingYears, TrainingTimesLastYear, WorkLifeBalance, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager, Attrition]
Index: []


MonthlyIncome Aykırı Değerl