In [20]:
# Structure of your solution to Assignment 1 
import pandas as pd
import numpy as np
from pandas import crosstab
np.seterr(divide = 'ignore')

def analyze_data(filepath):
    
    # add your code here
    df=pd.read_csv(filepath)
    question_df=df.sort_values(by='viewcount',ascending=False)      # sort the dataframe by viewount column
    temp=question_df[question_df['answercount']>0]                  # filter df where answercount>0 and put it in temp
    temp=temp.head(3)                                               # filter the top three records only
    print(temp[['title','viewcount']],'\n')                         # Print two columns title and viewcount
    temp_new=question_df.groupby('quest_name')['quest_name'].count()
    print("Top users with number of answers:\n",temp_new.sort_values(ascending=False).head())
    df['first_tag']=df['tags'].str.split(',').str[0]                # Split by ',' and get the section before first ',' 
    
    python_df=df.loc[df['first_tag']=='python']
    pandas_df=df.loc[df['first_tag']=='pandas']
    dataframe_df=df.loc[df['first_tag']=='dataframe']
    
    print('\n')
    mean_max_min=['mean','max', 'min']
    print('Stats of Python questions: \n',python_df['viewcount'].agg(mean_max_min),'\n')
    print('Stats of Pandas questions: \n',pandas_df['viewcount'].agg(mean_max_min),'\n')
    print('Stats of Dataframe questions: \n',dataframe_df['viewcount'].agg(mean_max_min),'\n')
    
    cross_tab=pd.crosstab(index=[df.answercount],columns=[df.first_tag],values=df.answercount,aggfunc='count')
    print("Number of Answers on each language by answercount:\n",cross_tab.head(5))

    
    
def analyze_tf_idf(arr,K):
    
    tf_ifd=None
    top_k=None
    
    # add your code here     
    tf=[]
    length_arr = np.sum(arr,axis=1)                      # find the length of each row
    numrows = len(arr)
    for i in range(numrows):
        tf.append(arr[i]/length_arr[i])                  # Form tf by diving it by length of document 
    
    df=np.sum(arr,axis=0)
    
    tf_idf= tf / (np.log(df)+1)
    
    sorted_tf_idf=np.argsort(tf_idf)                     # Array in ascending order
    sorted_tf_idf=sorted_tf_idf[:,::-1]                  # top 3 values at the start by reversing the array by columns
    top_k=sorted_tf_idf[:K,:K]                           # get the top_K array of the shape (M, K)
  
    return tf_idf, top_k


def analyze_corpus(filepath):
    
    # add your code here
    df=pd.read_csv(filepath)
    title=df['title']
    title_lower=title.str.lower()
    title_split=title_lower.str.split(' ')
    
    document_list=[]
    word_list=[]    
    for document in title_split:                                    # document is the sentence
        document_list.append(document)                              # document_list containing all sentences
        for words in document:                                      # word_list containing all words
            word_list.append(words)
    set_word_list=list(set(word_list))
    
    freq_array=[]
    for sentence in document_list:
        for words in set_word_list:
            freq_array.append(sentence.count(words))          
    freq_array=np.array(freq_array)                                 # Convert to numpy array
    len_document_list=len(document_list)                            # find the length of document_list
    len_word_list=len(set_word_list)                                # find the length of set_word_list
    freq_array=freq_array.reshape(len_document_list,len_word_list)  # Convert to the shape of len(document)*set_words_list 
    
    array_questions=freq_array[0:20,:]                              # select first n questions
    freq_array_td_idf, freq_array_top_K=analyze_tf_idf(array_questions,20) # Call the function analyze_tf_idf
    
    k=-1
    tf_idf_top_words_dict={}
    for i in freq_array_top_K[:,:10]:                                 # Use the top n words
        k+=1
        for j in i:
            tf_idf_top_words_dict[j] = freq_array_td_idf[k][j]        # populate the dictionary
    #print(tf_idf_top_words_dict)        
    sorted_list=[]
    for keys, values in sorted(tf_idf_top_words_dict.items(), key=lambda k:(k[1], k[0]),reverse=True):  #sort dict by value
        sorted_list.append((keys))                                   # Only append the loaction of top words
    top_sorted=sorted_list[:10]                                      # Top 10 words from the sorted list
    for i in top_sorted:
        print(set_word_list[i])

# best practice to test your class
# if your script is exported as a module,
# the following part is ignored
# this is equivalent to main() in Java


if __name__ == "__main__":  
    
    # Test 1
    arr=np.array([[0,1,0,2,0,1],[1,0,1,1,2,0],[0,0,2,0,0,1]])
    
    print("Tfidf score of each element in array: ")
    tf_idf, top_k=analyze_tf_idf(arr,3)
    print(tf_idf)
    print('\nTop 3 array element position by tfidf score:\n',top_k)
    # Test 2
    print("\nTop questions by viewcount:")
    print(analyze_data('C:/Users/HP/Downloads/question.csv'))
    
    # test 3
    print("\nTop words from the questions asked:")
    analyze_corpus('C:/Users/HP/Downloads/question.csv')
    

Tfidf score of each element in array: 
[[0.         0.25       0.         0.23825268 0.         0.14765403]
 [0.2        0.         0.09530107 0.09530107 0.23624644 0.        ]
 [0.         0.         0.31767024 0.         0.         0.19687204]]

Top 3 array element position by tfidf score:
 [[1 3 5]
 [4 0 3]
 [2 5 4]]

Top questions by viewcount:
                                                 title  viewcount
75   Python: Pandas pd.read_excel giving ImportErro...      33297
163                     Python convert object to float      16658
886                  Subtract two columns in dataframe      11176 

Top users with number of answers:
 quest_name
Shuvayan Das    7
Rahul rajan     7
Danny W         6
el323           6
user1017373     5
Name: quest_name, dtype: int64


Stats of Python questions: 
 mean      428.670091
max     33297.000000
min         5.000000
Name: viewcount, dtype: float64 

Stats of Pandas questions: 
 mean     454.6875
max     4499.0000
min       14.0000
Name: