# Quick Analysis Tool

<a href = "http://myy.haaga-helia.fi/~taaak/">Aki Taanila</a> has written a <a href="https://nbviewer.org/github/taanila/tilastoapu/blob/master/pika.ipynb">Quick Analysis Tool with Python</a> (a notebook in Finnish). The tool produces an Excel file containing frequency tables, cross-tabulations and essential statistical key figures for analyzing your research data. This section briefly explains the Quick Analysis Tool.

- You can use your own data below. If you want to learn the basics of Python and data analytics, see e.g. <a href="http://myy.haaga-helia.fi/~nurju/Teaching/#data-analytics-basics">Basics of Data Analytics with Python</a>.
- Run the code.
- After running the Python code, you will find the resulting Excel file open on your desktop.

<a href = "http://myy.haaga-helia.fi/~taaak/">Aki Taanila</a>'s <a href="https://nbviewer.org/github/taanila/tilastoapu/blob/master/pika.ipynb">Quick Analysis Tool with Python</a>:

In [1]:
import pandas as pd
import xlwings as xw

## A variable is considered quantitative if it has more unique values than Limit
## and its dtype is int64 or float64; a variable with at most Limit unique values
## is considered categorical. The value of Limit can be changed.

Limit = 10

## To use your own data, change the URL inside the apostrophes and uncomment the line below
## df = pd.read_excel('http://myy.haaga-helia.fi/~menetelmat/Data-analytiikka/Teaching/data1_en.xlsx')

### Data can also be retrieved from an Excel file that is open on your desktop. Let's do that.
### NOTE(review): per the xlwings documentation, xw.load() reads the selected cell(s) of the
### active workbook (a single selected cell is expanded to its surrounding range) - confirm
### for the installed xlwings version.

df = xw.load()

In [2]:
### Preliminary actions

### Define the lists; if necessary, leave them empty (mere brackets [])

quantitative = []

categorical = []

# Classify every column of df by its number of distinct values:
#  - at most Limit distinct values (any dtype)                 -> categorical
#  - more than Limit distinct values and dtype int64/float64   -> quantitative
#  - anything else (e.g. text with many distinct values) is left out of both lists
for var in df:
    # unique() is evaluated only once per column; it counts a missing value
    # (NaN) as a value of its own
    n_unique = len(df[var].unique())
    if n_unique <= Limit:
        categorical.append(var)
    elif df[var].dtype in ['int64', 'float64']:
        quantitative.append(var)


### Prepare Excel

# A separate, hidden Excel instance receives the results; it is made visible
# only after all the sheets have been written
app = xw.App(visible=False)

### Excel Sheets

# Take the workbook from the instance created above. xw.books.active refers to
# whichever Excel instance happens to be active, which could be the workbook
# that holds the source data - renaming and filling its sheets would destroy it.
wb = app.books.active
ws1 = wb.sheets.active
ws1.name = 'frequency_tables'
# after=wb.sheets.count appends each new sheet behind the current last sheet
ws2 = wb.sheets.add('cross-tabulations', after=wb.sheets.count)
ws3 = wb.sheets.add('statistical_numbers', after=wb.sheets.count)
ws4 = wb.sheets.add('correlations', after=wb.sheets.count)

In [3]:
### Frequency tables

if categorical:
    # row is the Excel row on which the next table starts; every table
    # starts in the first column
    row = 1
    column = 1
    # One frequency table per categorical variable, stacked below each other
    for var in categorical:
        # Frequencies of the values of the variable in the dataframe df1
        df1 = pd.crosstab(df[var], 'f')
        # Share of each value of the total
        df1['%'] = df1 / df1.sum()
        # Totals of both columns as the last row of df1
        df1.loc['Total'] = df1.sum()

        # Write df1 into the sheet frequency_tables
        ws1.range((row, column)).value = df1

        # Positions used in the formatting: the header is on row `row`,
        # the data rows follow and the Total row is the last one
        n_rows = df1.shape[0]
        total_row = row + n_rows
        pct_column = column + 2

        # Percentage format for the % column
        ws1.range((row + 1, pct_column), (total_row, pct_column)).number_format = '0,0 %'
        # Right-aligned column headers (-4152 = xlRight)
        ws1.range((row, column + 1), (row, pct_column)).api.HorizontalAlignment = -4152
        # Thin bottom border (Borders(9) = xlEdgeBottom) under the header ...
        ws1.range((row, column), (row, pct_column)).api.Borders(9).Weight = 2
        # ... and under the last value row, i.e. above the Total row
        ws1.range((total_row - 1, column), (total_row - 1, pct_column)).api.Borders(9).Weight = 2

        # Move past this table and leave one empty row before the next one
        row += n_rows + 2


In [4]:
### Cross-tabulations

if len(categorical) > 1:
    row = 1
    column = 1
    # Every ordered pair of two different categorical variables gets a table
    for var1 in categorical:
        for var2 in categorical:
            # A variable is not cross-tabulated against itself
            if var1 == var2:
                continue

            # df1 holds the counts, df2 the column percentages
            df1 = pd.crosstab(df[var1], df[var2])
            df2 = pd.crosstab(df[var1], df[var2], normalize='columns')
            df2.index.name = var1+'/'+var2
            # Column totals of the counts as the last row of df2
            df2.loc['n'] = df1.sum()

            ## Write the cross-tabulation to the Excel file
            ws2.range((row, column)).value = df2

            ## Formatting; df1 has one row less than df2 (no 'n' row)
            n_rows, n_cols = df1.shape
            last_pct_row = row + n_rows
            last_col = column + n_cols

            # Percentage format for everything except the 'n' row
            ws2.range((row + 1, column + 1), (last_pct_row, last_col)).number_format = '0,0 %'
            # Right-aligned column headers (-4152 = xlRight)
            ws2.range((row, column + 1), (row, last_col)).api.HorizontalAlignment = -4152
            # Thin bottom border (Borders(9) = xlEdgeBottom) under the header ...
            ws2.range((row, column), (row, last_col)).api.Borders(9).Weight = 2
            # ... and under the last percentage row, i.e. above the 'n' row
            ws2.range((last_pct_row, column), (last_pct_row, last_col)).api.Borders(9).Weight = 2

            ## Move past this table and leave one empty row before the next one
            row += df2.shape[0] + 2


In [5]:
### Statistical key figures

### Statistical key figures for quantitative variables are written in the sheet named statistical_numbers

### Names of the statistical numbers, in the order in which describe() returns them

statistical_numbers = ['Count', 'Mean', 'Standard Deviation', 'Minimum',
                       'Lower Quartile', 'Median', 'Upper Quartile', 'Maximum']

# The start position is set unconditionally so that nothing below depends on
# values of row and column left over from earlier cells
row = 1
column = 1

if quantitative:
    df1 = df[quantitative].describe()

    # Names of statistical numbers used
    df1.index = statistical_numbers

    # Write the table of statistical numbers into the Excel file
    ws3.range((row, column)).value = df1

    # Adjustment of column width
    ws3.range('A:A').autofit()

    # Formatting; the number format starts at row+2 so that the Count row
    # (the first data row) keeps the default format
    ws3.range((row+2, column+1), (row+len(df1), column+df1.shape[1])).number_format = '0,0'
    # Right-aligned column headers (-4152 = xlRight)
    ws3.range((row, column+1), (row, column+df1.shape[1])).api.HorizontalAlignment = -4152
    # Thin bottom border (Borders(9) = xlEdgeBottom) under the header and the last row
    ws3.range((row, column), (row, column+df1.shape[1])).api.Borders(9).Weight = 2
    ws3.range((row+len(df1), column), (row+len(df1), column+df1.shape[1])).api.Borders(9).Weight = 2

    # The next table starts below this one, leaving one empty row
    row = row + df1.shape[0] + 2

    ## Statistical key figures in groups defined by categorical variables

    # These tables need at least one quantitative variable, so they live inside
    # the quantitative branch; row then always continues from the table above.
    for var1 in categorical:
        for var2 in quantitative:
            if var1 != var2:
                df1 = df.groupby(var1)[var2].describe()

                # Names of statistical numbers used
                df1.columns = statistical_numbers

                df1.index.name = var1 + '/' + var2

                # Write the table of statistical numbers into the Excel file
                ws3.range((row, column)).value = df1

                # Formatting; the number format starts at column+2 so that the
                # Count column keeps the default format
                ws3.range((row+1, column+2), (row+len(df1), column+df1.shape[1])).number_format = '0,0'
                ws3.range((row, column+1), (row, column+df1.shape[1])).api.HorizontalAlignment = -4152
                ws3.range((row, column), (row, column+df1.shape[1])).api.Borders(9).Weight = 2
                ws3.range((row+len(df1), column), (row+len(df1), column+df1.shape[1])).api.Borders(9).Weight = 2

                # Increase the row number, leaving one empty row between tables
                row = row + df1.shape[0] + 2

# Adjustment of the column widths
ws3.range('B:I').column_width = 11


In [6]:
### Correlations

if quantitative:
    # Top-left cell of the correlation table. column must be assigned here:
    # assigning to the name `range` instead would shadow the builtin and leave
    # column dependent on a value from an earlier cell.
    row = 1
    column = 1
    # Pairwise correlation coefficients of the quantitative variables
    df1 = df[quantitative].corr()

    # Write the table of correlations into the Excel file
    ws4.range((row, column)).value = df1

    # Formatting: three decimals for the coefficients
    ws4.range((row+1, column+1), (row+len(df1), column+df1.shape[1])).number_format = '0,000'
    # Right-aligned column headers (-4152 = xlRight)
    ws4.range((row, column+1), (row, column+df1.shape[1])).api.HorizontalAlignment = -4152
    # Thin bottom border (Borders(9) = xlEdgeBottom) under the header and the last row
    ws4.range((row, column), (row, column+df1.shape[1])).api.Borders(9).Weight = 2
    ws4.range((row+len(df1), column), (row+len(df1), column+df1.shape[1])).api.Borders(9).Weight = 2

In [7]:
### Make the instance of the Excel file visible

# Bring the first sheet (frequency_tables) to the front before Excel is shown
ws1.activate()
# The Excel instance was started with visible = False; reveal it now
app.visible = True

Source: Aki Taanila, https://nbviewer.org/github/taanila/tilastoapu/blob/master/pika.ipynb

In [8]:
import datetime

# Time stamp of the moment this cell is run, printed to the minute
now = datetime.datetime.now()
print(f'Last modified {now:%Y-%m-%d %H:%M} by Juha Nurmonen')

Last modified 2023-04-21 19:41 by Juha Nurmonen
