In [16]:
import pandas as pd 
import numpy as np

def readFromFile(file_path):
    """
    Loads a comma-separated file into a pandas data frame.
    file_path: path of the file to read
    return: data frame with one row per record found in the file; the
            header row is handled by pandas' 'infer' mode
    """
    return pd.read_csv(file_path, sep=',', header='infer')


In [17]:
def numberOfEmployees(employees):
    """
    Counts the rows of a data frame (one row per employee).
    employees: data frame
    return: number of rows, as an int
    """
    return len(employees.index)

In [18]:
def numberAndTypesOfFields(employees):
    """
    Describes the columns of a data frame.
    employees: data frame
    return: tuple (number of columns, Series mapping each column name to its dtype)
    """
    return len(employees.columns), employees.dtypes


In [19]:
def numberOfEmployeesThatHaveAllDatasCompleted(employees):
    """
    Counts the rows that have no missing value in any column.
    employees: data frame
    return: number of fully populated rows, as an int
    """
    return len(employees.dropna().index)

In [20]:
def MinMaxAverageValuesForNumericFields(employees):
    """
    Computes per-column summary statistics over the numeric columns only.
    employees: data frame
    return: tuple (minimums, maximums, means), each a Series indexed by
            the numeric column names
    """
    return (
        employees.min(numeric_only=True),
        employees.max(numeric_only=True),
        employees.mean(numeric_only=True),
    )

In [21]:
def numberOfPossibleValuesForNonNumericProperties(employees):
    """
    Counts the distinct values of every non-numeric column.
    employees: data frame
    return: Series indexed by non-numeric column name holding the number of
            distinct values (missing values are not counted by nunique)
    """
    nonNumericColumns = employees.select_dtypes(exclude=np.number)
    return nonNumericColumns.nunique()
    

In [22]:
def hasEmptyValues(employees):
    """
    Tells whether the data frame contains at least one missing value.
    employees: data frame
    return: True - if at least one row has an empty (missing) property
            False - otherwise (including for an empty data frame)
    """
    # Check the null mask directly instead of building a NaN-free copy of
    # the whole frame just to compare row counts; bool() keeps the result a
    # plain Python bool rather than a numpy scalar.
    return bool(employees.isna().to_numpy().any())
    
def replaceEmptyValues(employees):
    """
    Replaces the empty values of every column with that column's most
    frequent value (its mode; when several values tie, the first one
    returned by Series.mode is used).
    The data frame is modified in place and also returned.
    Columns that contain no non-missing value at all are left untouched,
    because they have no mode to fill with.
    employees: data frame
    return: the same data frame object, with the gaps filled
    """
    for column in employees.columns:
        modes = employees[column].mode()
        if modes.empty:
            # Entirely missing column: nothing sensible to fill with.
            continue
        employees[column] = employees[column].fillna(modes.iloc[0])

    return employees


In [23]:
from matplotlib import pyplot as plt

def salaryCategories(employees):
    """
    Draws and shows a 15-bin histogram of the "Salary" column, i.e. how many
    employees fall into each salary category.
    employees: data frame with a "Salary" column
    """
    salaries = employees["Salary"]
    plt.hist(salaries, bins=15, rwidth=0.8)
    plt.show()

In [24]:
def salaryandTeamCategories(employees):
    """
    Draws and shows one salary histogram per team, stacked vertically.
    Rows whose "Team" is missing belong to no group and are not plotted.
    Nothing is drawn when there is no team at all.
    employees: data frame with "Salary" and "Team" columns
    """
    # Materialise the groups once so the number of subplots always matches
    # the groups that are iterated. Counting teams on employees.dropna()
    # could undercount, because dropna also removes rows that are missing
    # unrelated columns.
    teamGroups = list(employees[["Salary", "Team"]].groupby("Team"))
    if not teamGroups:
        return

    # squeeze=False keeps `axes` a 2-D array even for a single team, so it
    # can always be indexed the same way.
    _, axes = plt.subplots(len(teamGroups), 1, figsize=(10, 40), squeeze=False)
    for ax, (team, salaryTeam) in zip(axes[:, 0], teamGroups):
        salaryTeam["Salary"].plot(kind="hist", ax=ax)
        ax.set_title(team)

    plt.show()



In [25]:
def outlierEmployees(employees, lowerQuantile=0.01, upperQuantile=0.99):
    """
    Returns the employees whose salary is an outlier, i.e. lies strictly
    below the lower quantile or strictly above the upper quantile of the
    "Salary" column.
    Rows with a missing salary are not reported: a missing value is
    unknown, not extreme.
    employees: data frame with a "Salary" column
    lowerQuantile: quantile (0..1) below which a salary is an outlier
    upperQuantile: quantile (0..1) above which a salary is an outlier
    return: data frame containing only the outlier rows
    """
    salaries = employees["Salary"]
    qmin = salaries.quantile(lowerQuantile)
    qmax = salaries.quantile(upperQuantile)
    # between() is False for NaN, so negating it alone would flag every
    # missing salary as an outlier; require a present value explicitly.
    isOutlier = salaries.notna() & ~salaries.between(qmin, qmax)
    return employees[isOutlier]

In [26]:
def tests(employees):
    """
    Sanity checks for the analysis functions of problem 1a.
    The expected values are hard-coded, so they only hold for the specific
    data set they were derived from (main passes the contents of
    'employees.csv').
    employees: data frame
    Raises AssertionError when a check fails.
    """
    # problem 1a
    assert numberOfEmployees(employees) == 1000

    assert numberAndTypesOfFields(employees)[0] == 8

    assert numberOfEmployeesThatHaveAllDatasCompleted(employees) == 764

    # Compute the summaries once instead of once per assertion.
    minValues, maxValues, averageValues = MinMaxAverageValuesForNumericFields(employees)
    assert minValues.iloc[0] == 35013.0
    assert minValues.iloc[1] == 1.015
    assert maxValues.iloc[0] == 149908.0
    assert maxValues.iloc[1] == 19.944
    # Means are computed floats whose last bits depend on summation order,
    # so compare them with a tight relative tolerance instead of ==.
    assert np.isclose(averageValues.iloc[0], 90662.181, rtol=1e-9, atol=0.0)
    assert np.isclose(averageValues.iloc[1], 10.207555000000001, rtol=1e-9, atol=0.0)

    distinctCounts = numberOfPossibleValuesForNonNumericProperties(employees)
    assert distinctCounts.iloc[0] == 200
    assert distinctCounts.iloc[1] == 2
    assert distinctCounts.iloc[2] == 972
    assert distinctCounts.iloc[3] == 720
    assert distinctCounts.iloc[4] == 2
    assert distinctCounts.iloc[5] == 10

    assert hasEmptyValues(employees)
    

In [27]:
def main():
    """
    Entry point of the notebook: loads 'employees.csv' from the current
    working directory and runs the sanity checks in tests() on it.
    The commented-out lines below are the individual exercise steps; they
    all operate on the local `employees` data frame and can be enabled one
    at a time.
    """
    employees = readFromFile('employees.csv')
    tests(employees)
    # display(employees)

    # PROBLEM 1A
    # a) number of employees
    # print(numberOfEmployees(employees))

    # b) number and types of the fields
    # nr, types = numberAndTypesOfFields(employees)
    # print(nr, '\n', types)

    # c) number of employees with all fields completed
    # print(numberOfEmployeesThatHaveAllDatasCompleted(employees))

    # d) min / max / average of the numeric fields
    # minValues, maxValues, averageValues = MinMaxAverageValuesForNumericFields(employees)
    # print(minValues)
    # print(maxValues)
    # print(averageValues)

    # e) number of distinct values of the non-numeric fields
    # print(numberOfPossibleValuesForNonNumericProperties(employees))

    # f) missing values: detect, then fill
    # print(hasEmptyValues(employees))
    # display(replaceEmptyValues(employees))


    # PROBLEM 1B
    # a) salary histogram
    # salaryCategories(employees)

    # b) salary histogram per team
    # salaryandTeamCategories(employees)

    # c) salary outliers
    # display(outlierEmployees(employees))
    

# Run the entry point whenever this cell is executed.
main()