In [1]:
import pandas as pd, numpy as np

In [2]:
d = pd.read_csv(r"C:\Users\master\Downloads\Iris.csv")
d

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


### Write a class which takes the data set as argument and write methods for the following statistical measures without using built-in functions:

In [67]:
class Statistics:
    def __init__(self, path):
        self.df = pd.read_csv(path)

        
    def arithmetic_mean(self, column_name):
        '''
        sum of all the values in the specified column of the DataFrame.
        number of elements (or rows) in the specified column of the DataFrame.
        Calculation of Mean:
        The sum of the values in the column is divided by the number of elements in the column to get the arithmetic mean
        '''
        
        if column_name not in self.df.columns:
            return None 
        else: 
            mean = sum(self.df['SepalLengthCm']) / len(self.df['SepalLengthCm'])
            return mean
    
    
    def geometric_mean(self, column_name):
        product = 1
        '''
        Multiplies the current value of product by the current element (num) in the column. This is done iteratively for each element in the column
        Calculates the geometric mean by raising the product to the power of the reciprocal of the number of elements in the column. The reciprocal is calculated
        '''
        for num in self.df[column_name]:
            product *= num
        return product ** (1.0 / len(self.df[column_name]))
    
    def harmonic_mean(self, column_name):
        '''
        The sum of all the reciprocal of each value (num) in the specified column (column_name) of the DataFrame
        The number of elements in the specified column is divided by the sum of the reciprocals.
        '''
        return len(self.df[column_name]) / sum(1.0 / num for num in self.df[column_name])
    
    def mode(self, column_name):
        '''
        A for loop is used to iterate over each number (num) in the specified column of the DataFrame. 
        For each num, it checks if num is not already a key in the freq dictionary. If it’s not, it adds num as a key to the dictionary with a value of 1. 
        If num is already a key in the dictionary, it increments the value of that key by 1.
        After the loop, the freq dictionary contains the frequency of each number in the column. 
        The mode is then calculated by finding the key with the maximum value in the freq dictionary.
        '''
        freq = {}
        for num in self.df[column_name]:
            if num not in freq:
                freq[num] = 1
            else:
                freq[num] += 1
        mode_value = max(freq, key=freq.get)
        return mode_value
    
    def median(self, column_name):
        ''' 
        midpoint (mid) of the length of the specified column (column_name).
        the // operator performs integer division, ensuring that the result is an integer.
        
        median is computed as the average of these two middle values by adding them and dividing by 2.
        '''
        mid = len(self.df[column_name]) // 2
        return (self.df[column_name].iloc[mid] + self.df[column_name].iloc[~mid]) / 2

    def variance(self, column_name):
        '''
        The mean of the specified column is calculated using the arithmetic_mean method of the class. This method should return the average of the values in the column.
        The variance is then calculated by summing the squared differences between each number (num) in the column and the mean, and then dividing by the number of elements in the column. 
        This is done using a generator expression inside the sum function.
        '''
        mean = self.arithmetic_mean(column_name)
        variance = sum((num - mean) ** 2 for num in self.df[column_name]) / len(self.df[column_name])
        return variance
    
    def standard_deviation(self, column_name):
        '''
        it’s being used here to take the square root of the variance. 
        The square root of the variance gives us the standard deviation.
        '''
        return self.variance(column_name) ** 0.5
    
    def max(self, column_name):
        '''
        find the maximum value in the specified column (column_name) of the DataFrame
        '''
        return max(self.df[column_name])

    def min(self, column_name):
        '''
        find the minimum value in the specified column (column_name) of the DataFrame
        '''
        return min(self.df[column_name])
    
    def range(self, column_name):
        '''
        calculates the maximum & minimum value of the data in the specified column.
        calculates the range by subtracting the minimum value from the maximum value.
        '''
        max_value = max(self.df[column_name])
        min_value = min(self.df[column_name])

        range_value = max_value - min_value
        return range_value
    
    def iqr(self, column_name):
        values = list(self.df[column_name])
        n = len(values)
        for i in range(n):
            for j in range(0, n - i - 1):
                if values[j] > values[j + 1]:
                    values[j], values[j + 1] = values[j + 1], values[j]
        '''
        calculate the indices for the first quartile (Q1) and the third quartile (Q3). 
        In this case, it’s using the simplest method of quartile calculation, where Q1 is the value at 25% of the ordered data and Q3 is the value at 75% of the ordered data.
        '''
        q1_index = n // 4
        q3_index = n * 3 // 4

        q1 = values[q1_index]
        q3 = values[q3_index]

        return q3 - q1   # calculates the IQR by subtracting Q1 from Q3
    
    def quartiles(self, column_name):
        '''
        calculate the indices for the first quartile (Q1), the second quartile (Q2, also known as the median) and the third quartile (Q3). 
        The // operator performs integer (floor) division
        '''
        column = self.df[column_name]
        q1_index = len(column) // 4
        q2_index = len(column) // 2
        q3_index = len(column) * 3 // 4
        
        q1 = column[q1_index]
        q2 = column[q2_index]
        q3 = column[q3_index]

        return q1, q2, q3
    
    def coefficient_of_range(self, column_name):
        '''
        finds the maximum and minimum value in the column specified by column_name in the DataFrame
        calculates the range of the column, which is the difference between the maximum and minimum values
        calculates the coefficient of range, which is the ratio of the range to the sum of the maximum and minimum values. 
        The coefficient of range is a measure of relative dispersion in a distribution and it varies between 0 and 1.
        '''
        max_value = max(self.df[column_name])
        min_value = min(self.df[column_name])
        range_value = max_value - min_value
        coefficient_of_range_value = range_value / (max_value + min_value)
        return coefficient_of_range_value
    
    def coefficient_of_variation(self, column_name):
        '''
        calculates the coefficient of variation, which is the ratio of the standard deviation to the mean. 
        The coefficient of variation is a useful statistic for comparing the degree of variation from one data series to another
        '''
        return self.standard_deviation(column_name) / self.arithmetic_mean(column_name)
    
    def coefficient_of_standard_deviation(self, column_name):
        '''
        calculates the arithmetic mean (average) of the values in the column specified by column_name in the DataFrame
        calculates the standard deviation of the same column. The standard deviation is a measure of the amount of variation or dispersion in a set of values. 
        It is calculated as the square root of the average of the squared differences from the mean
        find the maximum and minimum values in the column and calculate the range by using min and max values
        calculates the coefficient of standard deviation, which is the ratio of the standard deviation to the range. 
        The coefficient of standard deviation is a measure of relative variability in a distribution
        '''
        mean = sum(self.df[column_name]) / len(self.df[column_name])
        std_dev = (sum((x - mean) ** 2 for x in self.df[column_name]) / len(self.df[column_name])) ** 0.5
        max_value = max(self.df[column_name])
        min_value = min(self.df[column_name])
        range_value = max_value - min_value
        coefficient_of_std_dev_value = std_dev / range_value
        return coefficient_of_std_dev_value

    
    def covariance(self, column_name1, column_name2):
        '''
        Calculates the correlation between two columns in the DataFrame.

        Parameters:
        - column_name1 (str): The name of the first column.
        - column_name2 (str): The name of the second column.

        Returns:
        - float: The correlation coefficient between the two columns.
        '''
        mean_x = self.df[column_name1].mean()
        mean_y = self.df[column_name2].mean()

        covariation = ((self.df[column_name1] - mean_x) * (self.df[column_name2] - mean_y)).sum() / (len(self.df) - 1)
        return covariation
    
    def correlation(self, column_name1, column_name2):
        '''
        Calculate the correlation coefficient between two columns in the DataFrame.

    Parameters:
    - column_name1 (str): Name of the first column.
    - column_name2 (str): Name of the second column.

    Returns:
    float: The correlation coefficient between the specified columns.

    This method calculates the correlation coefficient between two columns using
    the formula:

    correlation = numerator / (denominator1**0.5 * denominator2**0.5)

    where:
    - numerator is the sum of the product of the differences between each value 
      and the mean of the respective columns,
    - denominator1 is the sum of the squared differences between each value 
      and the mean of the first column,
    - denominator2 is the sum of the squared differences between each value 
      and the mean of the second column.

    Note:
    The method assumes that the lengths of the two columns are the same.
        '''
        
        mean1 = self.df[column_name1].mean()
        mean2 = self.df[column_name2].mean()
    
        numerator = sum((self.df[column_name1][i] - mean1) * (self.df[column_name2][i] - mean2) for i in range(len(self.df)))
        denominator1 = sum((self.df[column_name1][i] - mean1)**2 for i in range(len(self.df)))
        denominator2 = sum((self.df[column_name2][i] - mean2)**2 for i in range(len(self.df)))
    
        correlation = numerator / (denominator1**0.5 * denominator2**0.5)
    
        return correlation

In [68]:
path = r"C:\Users\master\Downloads\Iris.csv"
iris = Statistics(path)

In [39]:
a_mean = iris.arithmetic_mean('SepalLengthCm')
a_mean

5.843333333333335

In [40]:
iris.geometric_mean('SepalLengthCm')

5.785720390427729

In [41]:
iris.harmonic_mean('SepalLengthCm')

5.728905057850834

In [42]:
iris.mode('SepalLengthCm')

5.0

In [43]:
iris.median('SepalLengthCm')

6.5

In [44]:
iris.variance('SepalLengthCm')

0.6811222222222222

In [45]:
iris.standard_deviation('SepalLengthCm')

0.8253012917851409

In [46]:
iris.max('SepalLengthCm')

7.9

In [47]:
iris.min('SepalLengthCm')

4.3

In [48]:
iris.range('SepalLengthCm')

3.6000000000000005

In [49]:
iris.iqr('SepalLengthCm')

1.3000000000000007

In [50]:
iris.quartiles('SepalLengthCm')

(4.9, 6.6, 6.8)

In [51]:
iris.coefficient_of_range('SepalLengthCm')

0.2950819672131148

In [52]:
iris.coefficient_of_variation('SepalLengthCm')

0.1412380989934639

In [53]:
iris.coefficient_of_standard_deviation('SepalLengthCm')

0.22925035882920577

In [69]:
iris.covariance('SepalLengthCm', 'PetalLengthCm')

1.273682326621924

In [70]:
iris.correlation('SepalLengthCm', 'SepalWidthCm')

-0.10936924995064931