In [None]:
# Uncomment the line below to install ruptures if it is not already installed

# !python -m pip install ruptures

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ruptures as rpt
from scipy.stats import zscore, linregress, iqr
import numpy as np
import xlsxwriter
import os
from datetime import datetime

# Please enter the details below

In [None]:
# PELT penalty passed to algo.predict(pen=...). Per the ruptures documentation a
# higher penalty yields fewer detected change points.
penalty = 3 # higher means fewer change points
# A KPI is reported only when its absolute relative change (mean after the first
# change point vs. mean before it) is at least this threshold.
chg_thd = 0.0
dim = 'Whole Network' # dimension you are interested in (NOTE(review): not referenced in the cells visible here)
date = 'Date'     # name of the date column; its values label the x-axis of the plots
source_file_path = "C:\\Users\\BronyahJ\\Downloads\\" # folder holding the source data (keep the trailing path separator)
file_name = "4G_ALL_KPIs_Query_Result_20230213092527290.xlsx" # source filename

# Specify the columns you want to delete in the `to_delete` list

In [None]:
# When True, rows in which every column is missing are dropped. When False, missing
# values are imputed instead (numeric columns get their median, then forward/backward fill).
REMOVE_NA = True
# Columns removed from the loaded data when present; names that do not exist are skipped.
to_delete = ['LocalCell Id','Integrity', 'Cell ID', 'Cell CI', 'CellIndex','NR Cell ID']
# Text placeholders replaced by np.nan in text-typed columns (exact, case-sensitive match).
KNOWN_NA_VALS = ['NIL', 'NILL', 'NULL', 'NA', '#NA', '#N/A', 'N/A','#VALUE!','#REF!','#DIV/0!','#NUM!','#NAME?','#NULL!','NAN','nan','NaN']

In [None]:
# Read every worksheet of the workbook in one pass and place the sheets side by
# side (column-wise), aligned on the row index.
# sheet_name=None makes pandas return a {sheet name: DataFrame} dict covering all
# sheets, so the file is parsed once and no exception is needed to detect the
# last sheet (which also means genuine read errors are no longer swallowed).
workbook_path = os.path.join(source_file_path, file_name)
all_sheets = pd.read_excel(workbook_path, sheet_name=None)
df = pd.concat(list(all_sheets.values()), axis=1)
    

In [None]:
# Preview the first rows of the combined sheets
df.head()

In [None]:
# (rows, columns) of the combined data before any column is removed
df.shape

In [None]:
# Remove the configured columns that are actually present in the frame; names
# that do not occur are skipped, so no KeyError is raised for them.
if to_delete:
    present = [col for col in to_delete if col in df.columns]
    df = df.drop(columns=present)

In [None]:
# Drop columns whose values repeat an earlier column (the first occurrence is kept).
# The transpose is used only to build the duplicate mask; the columns themselves are
# selected from the original frame so they keep their dtypes. Round-tripping through
# df.T.drop_duplicates().T would turn every column of a mixed-type frame into object dtype.
# The mask is applied as a plain NumPy array (positional), so repeated column labels
# cannot cause an alignment error.
is_repeat = df.T.duplicated().to_numpy()
df = df.loc[:, ~is_repeat].copy()

In [None]:
# (rows, columns) after the unwanted and duplicated columns were removed
df.shape

In [None]:
# Make the column labels unique: the first occurrence of a label keeps its name,
# every later occurrence gets a ".1", ".2", ... suffix.
cols = pd.Series(df.columns)
repeated_labels = cols[cols.duplicated()].unique()
for dup in repeated_labels:
    positions = cols.index[cols == dup].tolist()
    renamed = [dup]
    renamed.extend(dup + '.' + str(k) for k in range(1, len(positions)))
    cols[positions] = renamed

df.columns = cols

# Data cleaning and processing

In [None]:
############################################################################################
# Experimental: replace the known textual NULL / NA placeholders with np.nan and then
# try to turn each text-typed column into a numeric one.
############################################################################################
# dtype kinds that can hold text: object, bytes, unicode, void. The kinds are compared
# case-sensitively so the unsigned-integer kind 'u' is not mistaken for unicode 'U'.
text_like_cols = [c for c in df.columns if df[c].dtype.kind in ('O', 'S', 'U', 'V')]
for x in text_like_cols:
    na_mask = df[x].isin(KNOWN_NA_VALS)
    if na_mask.any(): # some instances were found with known NA substitutions
        df.loc[na_mask, x] = np.nan
        print("Found some known NA substitutions in {}. Will replace and try to force as numeric".format(x))
    try: # now we will try to see if the column can become numeric
        df[x] = pd.to_numeric(df[x], errors='raise')
    except (ValueError, TypeError):
        # ValueError: a string that is not a number.
        # TypeError: a non-numeric object (e.g. a timestamp) inside an object column.
        # Either way the column is left untouched.
        continue
############################################################################################

############################################################################################
# Experiment 2: if every non-missing entry of a text column ends (first pass) or starts
#     (second pass) with a unit / currency marker such as % or $, strip that marker and
#     keep the column as numeric when the remainder parses as a number.
############################################################################################
sp_endings = ['%', '$', '#', '£', 'QAR', 'GBP', 'qar', 'gbp', 'usd', 'USD' ,'eur', 'EUR']


def _strip_affix_to_numeric(column, affix, at_end):
    """Return `column` converted to numbers after removing `affix`, or None.

    Parameters
    ----------
    column : pandas.Series
        Text-typed column to inspect; missing entries are allowed.
    affix : str
        Marker expected on every non-missing entry.
    at_end : bool
        True to treat `affix` as a suffix, False to treat it as a prefix.

    Returns
    -------
    pandas.Series or None
        Numeric series on the same index (missing entries stay NaN), or None when
        the column has no non-missing entry, when some entry lacks the marker, or
        when the stripped text is not numeric.
    """
    missing = column.isna().to_numpy()
    text = column[~missing].astype(str)
    if text.empty:
        return None
    has_affix = text.str.endswith(affix) if at_end else text.str.startswith(affix)
    if not has_affix.all():
        return None
    # Slice off exactly one marker at the checked position. A plain str.replace would
    # also delete occurrences elsewhere in the value and depends on the regex default.
    stripped = text.str[:-len(affix)] if at_end else text.str[len(affix):]
    try:
        numeric = pd.to_numeric(stripped, errors='raise')
    except (ValueError, TypeError):
        return None
    if not missing.any():
        return numeric
    # Put the converted values back at their original positions; gaps remain NaN.
    values = np.full(len(column), np.nan)
    values[~missing] = numeric.to_numpy(dtype=float)
    return pd.Series(values, index=column.index, name=column.name)


# First pass handles endings, second pass handles beginnings. The list of text columns
# is rebuilt for each pass because the first pass may already have converted some.
for at_end, position in ((True, 'endings'), (False, 'startings')):
    text_cols = [c for c in df.columns if df[c].dtype.kind in ('O', 'S', 'U', 'V')]
    for x in text_cols:
        for sp in sp_endings:
            converted = _strip_affix_to_numeric(df[x], sp, at_end)
            if converted is None:
                continue # marker not applicable to this column; no change
            df[x] = converted
            print("Modified column {} for special {} {} and changed to numeric".format(
                    x, position, sp))
            break # the column is numeric now; no other marker can apply
############################################################################################
# Now we have to deal with NA values
if REMOVE_NA:
    # Only rows in which every column is missing are removed.
    df = df.dropna(axis=0, how='all')
else:
    # Numeric columns (float, complex, signed and unsigned int): fill gaps with the median.
    # The result is assigned back explicitly; calling fillna(inplace=True) on df[x]
    # operates on a temporary object and is not guaranteed to update df.
    numeric_cols = [c for c in df.columns if df[c].dtype.kind in ('f', 'c', 'i', 'u')]
    for x in numeric_cols:
        df[x] = df[x].fillna(df[x].median())
    # Remaining gaps: carry the previous value forward, then the next value backward.
    # (ffill()/bfill() replace the deprecated fillna(method=...) form.)
    df = df.ffill().bfill()
############################################################################################

    
############################################################################################
# WARNING - DO NOT REMOVE THIS STEP WITHOUT UNDERSTANDING IT
# Mandatory: any column that consists entirely of np.nan is deleted.
na_per_column = df.isna().sum(axis=0)
FULL_NA_COLS = na_per_column[na_per_column == len(df)].index.tolist()
df = df.drop(columns=FULL_NA_COLS)

In [None]:
# Preview the first rows after cleaning
df.head()

In [None]:
# (rows, columns) after cleaning and removal of all-NaN columns
df.shape

In [None]:
# First rows of the cleaned frame (input to the analysis below)
df.head()

In [None]:
# Independent deep copy of the cleaned frame: the change-point analysis reads from
# df1, while the plots further down read from df.
df1=df.copy(deep=True)

In [None]:
# First rows of the analysis copy
df1.head()

In [None]:
def _minmax_scale(values):
    """Scale a 1-D float array to the [0, 1] range, ignoring NaNs.

    The minimum and the range are taken once from the untouched input, so every
    element is scaled with the same two numbers. A constant (zero-range) input
    maps to zeros. NaN entries stay NaN.
    """
    lowest = np.nanmin(values)
    span = np.nanmax(values) - lowest
    if not span > 0:
        return np.where(np.isnan(values), np.nan, 0.0)
    return (values - lowest) / span


# Guards the relative-change division against a zero "before" mean.
_EPS = 0.0000000000001

kpis_chng = []
# Every column except text-like (object/bytes/unicode/void) and datetime/timedelta ones.
# Kinds are compared case-sensitively so unsigned integers ('u') are not mistaken for
# unicode text ('U') and dropped from the analysis.
numeric_kpis = [c for c in df1.columns
                if df1[c].dtype.kind not in ('O', 'S', 'U', 'V', 'M', 'm')]
for y in numeric_kpis:
    # Work on a private float array so neither df1 nor df is modified.
    signal = df1[y].to_numpy(dtype=float)
    if np.isnan(signal).all():
        continue # nothing to analyse

    # NOTE(review): missing values reach PELT unchanged, exactly as before; confirm
    # how ruptures treats NaN in the signal.
    result = rpt.Pelt(model="rbf").fit(signal).predict(pen=penalty)
    # The last breakpoint returned by ruptures is the signal length, so a single
    # entry means that no change point was found.
    if len(result) <= 1:
        continue

    scaled = _minmax_scale(signal)

    # Three-sigma band used to exclude outliers from the "before" mean.
    centre = np.nanmean(scaled)
    spread = np.nanstd(scaled)
    cutoff_low = centre - (3 * spread)
    cutoff_high = centre + (3 * spread)

    # Overall trend, fitted on the non-missing points only.
    usable = np.isfinite(scaled)
    if usable.sum() >= 2:
        slope = linregress(np.flatnonzero(usable), scaled[usable]).slope
    else:
        slope = np.nan

    first_cp = result[0]
    after = scaled[first_cp:result[-1]]
    before = scaled[:first_cp]
    before = before[~np.isnan(before)]
    before = before[(before >= cutoff_low) & (before <= cutoff_high)]
    if before.size == 0 or np.isnan(after).all():
        continue # no usable data on one side of the first change point

    post = np.nanmean(after)
    pre = np.mean(before)

    chg_perc = (post - pre) / (pre + _EPS)
    if abs(chg_perc) >= chg_thd:
        # Record layout: name ___ |relative change| ___ |slope| ___ first change point ___ breakpoint count
        kpis_chng.append("___".join([str(y), str(abs(chg_perc)), str(abs(slope)),
                                     str(result[0]), str(len(result))]))

In [None]:
# One row per flagged KPI; each entry is the '___'-joined record built by the analysis loop
df_kpi_chng = pd.DataFrame (kpis_chng, columns = ['kpi_chg'])

In [None]:
# Split each record on '___' into positional columns:
#   0 = KPI name, 1 = |relative change|, 2 = |slope|,
#   3 = first change-point index, 4 = number of breakpoints returned by predict.
# Every column holds text after the split.
df_kpi_c=df_kpi_chng['kpi_chg'].str.split('___', expand=True)

In [None]:
# The split leaves every field as text, and sorting text ranks '9' above '10'.
# Convert the four metric columns to numbers first so that the ordering is numeric.
for metric_col in (1, 2, 3, 4):
    df_kpi_c[metric_col] = pd.to_numeric(df_kpi_c[metric_col], errors='coerce')
# Descending by: first change-point index, breakpoint count, |slope|, |relative change|.
df_kpi_c = df_kpi_c.sort_values(by=[3, 4, 2, 1], ascending=False)

In [None]:
# KPI names (column 0) in sorted order; this list drives the plotting loop
kpis_changed = df_kpi_c[0].tolist()

In [None]:
# Number of KPIs that were flagged
len(kpis_changed)

In [None]:
# Plot every flagged KPI together with its detected change points.
for kpi in kpis_changed:
    signal = df[kpi].values
    algo = rpt.Pelt(model="rbf").fit(signal)
    result = algo.predict(pen=penalty)
    # rpt.display creates its own figure and forwards extra keyword arguments to
    # plt.subplots, so the size is passed here. A separate plt.figure() call would
    # only leave an empty figure behind for every KPI.
    fig, axes = rpt.display(signal, result, figsize=(10, 5))
    # Title with the KPI name itself rather than the repr of an empty Series.
    axes[0].set_title(str(kpi))
    plt.xticks(np.arange(len(signal)), df[date], rotation='vertical')
    print(kpi)
    plt.show()
    plt.close(fig) # release the figure so a long KPI list does not accumulate memory