In [None]:
import csv
import operator
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def task1(file_name, attribute_name):
    """Count the values of one CSV column and write a text and a figure report.

    The first row of ``file_name`` is treated as the header; the column whose
    header equals ``attribute_name`` (first match) is the one that is counted.
    Data rows that are too short to contain that column, including blank
    lines, are skipped.

    Two files are written to the current working directory:

    * ``<attribute_name>.txt`` -- one ``"<value> <count>"`` line per distinct
      value, ordered by descending count, ties broken by ascending value.
    * ``<attribute_name> <n>.pdf`` -- a bar chart of the counts, where ``n``
      is the number of distinct values. When there are more than 10 distinct
      values only the 5 most frequent and the 5 least frequent are drawn.

    Args:
        file_name: Path of the comma-separated input file.
        attribute_name: Header text of the column to count.

    Returns:
        None. Problems are reported on stdout instead of being raised:
        a missing header/column prints ``[-] Cannot find attibute name`` and
        a ``FileNotFoundError`` prints ``[-] Wrong file or file path``.
    """
    valueDict = {}
    try:
        # newline='' lets the csv module do its own line-ending handling,
        # which matters for quoted fields that contain line breaks.
        with open(file_name, newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            header = next(csv_reader, None)
            # An empty file has no header row, so the column cannot exist.
            if header is None or attribute_name not in header:
                print('[-] Cannot find attibute name')
                return
            # index() yields the first matching column.
            attributeIndex = header.index(attribute_name)
            for row in csv_reader:
                # Blank lines come back as [] and ragged rows may stop before
                # the wanted column; neither carries a value to count.
                if len(row) <= attributeIndex:
                    continue
                value = row[attributeIndex]
                valueDict[value] = valueDict.get(value, 0) + 1

        # (1) Order by descending count; equal counts fall back to the value
        # itself in ascending order.
        sortedValue = sorted(valueDict.items(),
                             key=lambda item: (-item[1], item[0]))

        # (2) Text output
        outputFile = attribute_name + '.txt'
        with open(outputFile, 'w') as of:
            for value, count in sortedValue:
                of.write(str(value) + ' ' + str(count) + '\n')
        print('[+] text output finished !')

        # (3) Figure output
        numAttributeValues = len(sortedValue)
        if numAttributeValues > 10:
            # Too many bars to read: keep the 5 most and 5 least frequent.
            shown = sortedValue[:5] + sortedValue[-5:]
        else:
            shown = sortedValue
        attributeValues = [item[0] for item in shown]
        frequency = [item[1] for item in shown]

        fig, ax = plt.subplots()
        ind = np.arange(len(attributeValues))  # one x position per bar
        width = .3
        ax.bar(ind, frequency, width, color='magenta')
        ax.set_ylabel('Frequency')
        ax.set_xlabel('Attribute Names')
        ax.set_xticks(ind)
        # Vertical labels so long values do not overlap.
        ax.set_xticklabels(attributeValues, rotation=90)
        fileName = attribute_name + ' ' + str(numAttributeValues) + '.pdf'
        fig.savefig(fileName, bbox_inches='tight')
        print('[+] figure output finished !')
    except FileNotFoundError:
        print("[-] Wrong file or file path")

In [None]:
import csv
import operator
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

def task2(file_name):
    """Count co-occurring values for every pair of CSV columns.

    The first row of ``file_name`` is the header. For each pair of columns
    ``(i, j)`` with ``i < j`` the function counts how often each combination
    of values appears in the data rows, then:

    * writes ``<header_i>_<header_j>.txt`` to the current working directory,
      one ``"<value_i> <value_j> <count>"`` line per combination, ordered by
      descending count with ties broken by the ascending string
      ``"<value_i>$<value_j>"``;
    * shows a heatmap of the counts with the values of column ``i`` on the
      y axis (matrix rows) and the values of column ``j`` on the x axis
      (matrix columns).

    Empty cells are reported as ``(empty)``. Rows with fewer cells than the
    header (including blank lines) are skipped; extra trailing cells are
    ignored. A column pair with no data rows still gets an (empty) text file
    but no heatmap.

    Args:
        file_name: Path of the comma-separated input file.

    Returns:
        None. A ``FileNotFoundError`` is reported on stdout as
        ``[-] Wrong file or file path`` instead of being raised.
    """
    try:
        # newline='' lets the csv module do its own line-ending handling.
        with open(file_name, newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            headers = next(csv_reader, [])
            attributeCount = len(headers)
            # Every (i, j) with i < j, in the order the reports are produced.
            columnPairs = [(i, j)
                           for i in range(attributeCount)
                           for j in range(i + 1, attributeCount)]
            # Counts are keyed by (first, second) tuples rather than a joined
            # string, so values may contain any character, including '$'.
            pairValueDict = {pair: {} for pair in columnPairs}
            for row in csv_reader:
                # Blank lines / ragged rows cannot supply every column.
                if len(row) < attributeCount:
                    continue
                # Give empty cells a visible name so the space-separated
                # output stays unambiguous, whichever side of the pair it is.
                values = [cell if cell != '' else '(empty)'
                          for cell in row[:attributeCount]]
                for i, j in columnPairs:
                    counts = pairValueDict[(i, j)]
                    key = (values[i], values[j])
                    counts[key] = counts.get(key, 0) + 1

        for i, j in columnPairs:
            counts = pairValueDict[(i, j)]
            outputFile = headers[i] + '_' + headers[j] + '.txt'
            # (1) Descending count; ties ordered by the "first$second" string.
            sortedValue = sorted(
                counts.items(),
                key=lambda item: (-item[1], item[0][0] + '$' + item[0][1]))
            # (2) write to output file
            with open(outputFile, 'w') as of:
                for (firstAtt, secondAtt), count in sortedValue:
                    of.write(firstAtt + ' ' + secondAtt + ' ' + str(count) + '\n')
            print("[+] text output finished !")

            # Nothing to draw for a pair without data rows.
            if not sortedValue:
                continue

            # (3) Heatmap. Distinct values keep the order in which they first
            # appear in the sorted report (dicts preserve insertion order).
            firstAttribute = list(dict.fromkeys(pair[0] for pair, _ in sortedValue))
            secondAttribute = list(dict.fromkeys(pair[1] for pair, _ in sortedValue))
            rowOf = {name: idx for idx, name in enumerate(firstAttribute)}
            colOf = {name: idx for idx, name in enumerate(secondAttribute)}
            countMatrix = np.zeros((len(firstAttribute), len(secondAttribute)),
                                   dtype=int)
            for (firstAtt, secondAtt), count in sortedValue:
                countMatrix[rowOf[firstAtt], colOf[secondAtt]] = count
            print("[+] done with matrix. Plotting heatmap...")

            fig, ax = plt.subplots()
            sns.heatmap(countMatrix, cmap='YlGnBu', ax=ax)
            # Heatmap cells span [k, k+1], so +0.5 centres each tick on its
            # cell. Matrix columns (x) are second-column values and matrix
            # rows (y) are first-column values.
            ax.set_xticks(np.arange(len(secondAttribute)) + 0.5)
            ax.set_xticklabels(secondAttribute, rotation=90)
            ax.set_yticks(np.arange(len(firstAttribute)) + 0.5)
            ax.set_yticklabels(firstAttribute, rotation=90)
            ax.set_xlabel(headers[j])
            ax.set_ylabel(headers[i])
            plt.show()
            # One figure per column pair: release it so they do not pile up.
            plt.close(fig)

    except FileNotFoundError:
        print("[-] Wrong file or file path")

In [None]:
# Run the column-pair analysis on 'HW1_input.csv' (path is relative to the
# working directory); task2 writes its text reports there and shows heatmaps.
task2('HW1_input.csv')

In [None]:
# Scratch check: a range object supports indexing; element 0 of range(1, 10) is 1.
print(range(1,10)[0])

In [None]:
# Sample 7x7 matrix of floating-point values; each inner list is one row.
harvest = np.array(
    [
        [0.8, 2.4, 2.5, 3.9, 0.0, 4.0, 0.0],
        [2.4, 0.0, 4.0, 1.0, 2.7, 0.0, 0.0],
        [1.1, 2.4, 0.8, 4.3, 1.9, 4.4, 0.0],
        [0.6, 0.0, 0.3, 0.0, 3.1, 0.0, 0.0],
        [0.7, 1.7, 0.6, 2.6, 2.2, 6.2, 0.0],
        [1.3, 1.2, 0.0, 0.0, 0.0, 3.2, 5.1],
        [0.1, 2.0, 0.0, 1.4, 0.0, 1.9, 6.3],
    ]
)

In [None]:
# Show the `harvest` matrix (must already be defined in the kernel).
print(harvest)

In [None]:
# Scratch list holding the value 1 twice.
a = [1] * 2

In [None]:
# Show the scratch list `a` (must already be defined in the kernel).
print(a)