In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
#Some Notes:
# Integrate all the articles into one Dataframe, which enables us to do more quick computation of probabilities
# We label each article with two types of labels, one is journal type, one is region_type (which only economist will have a region type)
# Seperate train and test group by the algorithm of 10-fold cross_validation
# Fixing a train group，P(class) is fixed for each class, conditional probability is calculated by counting
# Here in order to simplify the problem, we do not update our whole vocab list in different training set
# Conditional probability of a word w is count the appearance of word w appear in all words of class c in training group
# Do remember to do a laplace transform to those word count

In [3]:
def txt2df(file_name,folder_name,path_name):
    '''
    A helper function to convert txt word_count vectors into dataframes so that we can concate all the articles into
    a big dataframe
    
    '''
    # handel the journal type
    if file_name[:5] == "onion":
        is_onion = True
    else:
        is_onion = False
    #handel the region type
    region = folder_name[:-6]
    full_file_name = os.path.join(path_name,folder_name,file_name)
    f = open(full_file_name,encoding ='ISO-8859-1')
    lines = f.readlines()
    columns = []
    word_counts = np.array([])
    for line in lines: 
        name = line.split()[0]
        count = line.split()[1]
        columns.append(name)
        word_counts = np.append(word_counts,int(count))
    f.close()
    ret = pd.DataFrame(word_counts.reshape(1,-1),columns = columns)
    if is_onion:
        ret["journal_type"] = "onion"
        ret["region_type"] = None
    else:
        ret["journal_type"] = "economist"
        ret["region_type"] = region
    ret["article"] = file_name[:-6]
    return ret

# there might be hidden files like .Ds_store, and we do not want those files being included in our listdir
def listdir_nohidden(path): 
    ret = []
    for f in os.listdir(path):
        if not f.startswith('.'):
            ret.append(f)
    return ret
            
def generate_df():
    
    '''
    Since we transformed all our word count to each article in vector form and store them in txt
    
    This function just join all the articles vectors into one big dataframe and do the labelling for each article  
    '''
    
    sub_df = None
    econ_path_name = "/Users/yuanzheyang/Desktop/nyu material/7871/HW1/economist_wfreq"
    onion_path_name = "/Users/yuanzheyang/Desktop/nyu material/7871/HW1/onion_wfreq"
    econ_folder_names = listdir_nohidden(econ_path_name)
    print(econ_folder_names)
    for econ_folder_name in econ_folder_names:
        econ_file_names = listdir_nohidden(os.path.join(econ_path_name,econ_folder_name))
        print(econ_file_names)
        for econ_file_name in econ_file_names:
            sub_df = pd.concat([sub_df,txt2df(econ_file_name,econ_folder_name,econ_path_name)],axis = 0)
    print(sub_df)
    onion_folder_names = listdir_nohidden(onion_path_name)
    print(onion_folder_names)
    for onion_folder_name in onion_folder_names:
        onion_file_names = listdir_nohidden(os.path.join(onion_path_name,onion_folder_name))
        print(onion_file_names)
        for onion_file_name in onion_file_names:
            sub_df = pd.concat([sub_df,txt2df(onion_file_name,onion_folder_name,onion_path_name)],axis = 0)
    return sub_df
            

# Step1: Generate Dataset

In [4]:
path_name = "/Users/yuanzheyang/Desktop/nyu material/7871/HW1/economist_wfreq"
folder_name = "africa_wfreq"
file_name = "africa.1.wfreq"
x = txt2df(file_name,folder_name,path_name)

In [5]:
df = generate_df()

['africa_wfreq', 'britain_wfreq', 'north_america_wfreq', 'latin_america_wfreq', 'asia_wfreq', 'international_wfreq', 'europe_wfreq']
['africa.72.wfreq', 'africa.37.wfreq', 'africa.56.wfreq', 'africa.13.wfreq', 'africa.54.wfreq', 'africa.11.wfreq', 'africa.69.wfreq', 'africa.70.wfreq', 'africa.35.wfreq', 'africa.9.wfreq', 'africa.49.wfreq', 'africa.74.wfreq', 'africa.31.wfreq', 'africa.50.wfreq', 'africa.15.wfreq', 'africa.28.wfreq', 'africa.52.wfreq', 'africa.17.wfreq', 'africa.33.wfreq', 'africa.68.wfreq', 'africa.10.wfreq', 'africa.55.wfreq', 'africa.8.wfreq', 'africa.34.wfreq', 'africa.71.wfreq', 'africa.36.wfreq', 'africa.73.wfreq', 'africa.12.wfreq', 'africa.57.wfreq', 'africa.16.wfreq', 'africa.53.wfreq', 'africa.32.wfreq', 'africa.30.wfreq', 'africa.48.wfreq', 'africa.29.wfreq', 'africa.14.wfreq', 'africa.51.wfreq', 'africa.18.wfreq', 'africa.60.wfreq', 'africa.25.wfreq', 'africa.44.wfreq', 'africa.2.wfreq', 'africa.46.wfreq', 'africa.62.wfreq', 'africa.27.wfreq', 'africa.66.wfr

      A     a  fleet   26   AP   At  weapons   He   In   It  ...  Jens  \
0   1.0  12.0    1.0  1.0  2.0  1.0      1.0  2.0  1.0  1.0  ...   NaN   
0   1.0   4.0    NaN  NaN  NaN  NaN      NaN  NaN  NaN  2.0  ...   NaN   
0   NaN   7.0    NaN  NaN  NaN  NaN      NaN  1.0  2.0  NaN  ...   NaN   
0   1.0  26.0    NaN  NaN  NaN  1.0      NaN  1.0  3.0  NaN  ...   NaN   
0   1.0  25.0    NaN  NaN  NaN  NaN      NaN  1.0  3.0  3.0  ...   NaN   
..  ...   ...    ...  ...  ...  ...      ...  ...  ...  ...  ...   ...   
0   3.0  14.0    NaN  NaN  NaN  NaN      NaN  NaN  NaN  NaN  ...   NaN   
0   NaN  23.0    NaN  NaN  NaN  NaN      NaN  NaN  1.0  NaN  ...   NaN   
0   NaN  26.0    NaN  NaN  NaN  NaN      NaN  2.0  3.0  NaN  ...   NaN   
0   1.0   8.0    NaN  NaN  NaN  NaN      NaN  NaN  1.0  1.0  ...   NaN   
0   1.0   9.0    NaN  NaN  NaN  NaN      NaN  NaN  1.0  NaN  ...   1.0   

    ex-communists,  CDU)  CDU.  ago;  Böhrnsen  inheritance.  paves  \
0              NaN   NaN   NaN   NaN    

In [6]:
df = df.reset_index(drop = True)

# Step2: construct My Naive Bayes Classifier

In [20]:
class My_Bayes_Classifier:
    
    def __init__(self,df_wholedata):
        self.original_df = df_wholedata
        self.shuffled_df = self.original_df.sample(frac = 1)
        self.Vocab_size = self.original_df.shape[1] - 3
        self.cv_result_list_1 = []
        self.cv_result_list_2 = []
        self.confusion_matrices_1 = []
        self.confusion_matrices_2 = []
        self.recall_1 = []
        self.recall_2 = []
        self.precision_1 = []
        self.precision_2 = []
        self.accuracy_1 = []
        self.accuracy_2 = []
        
    def shuffle(self):
        #random shuffle the whole dataset in order to split train and test
        self.shuffled_df = self.original_df.sample(frac = 1)
        
    def get_economist(self):
        # since only economist has region_type, we need to extract all the economist as our whole dataset before we doing task 2
        ret = self.shuffled_df.loc[self.shuffled_df.journal_type == "economist"]
        return ret
    
    def nb_algo(self,train_indices,test_indices,data_df,objective_classes,objective_type):
        '''
        The main algorithm of naive bayes
        '''
        class_probs = {}
        train_set = data_df.iloc[train_indices]
        test_set = data_df.iloc[test_indices]
        for objective_class in objective_classes:        
            Prob_is_class = train_set.loc[train_set[objective_type] == objective_class].shape[0]/train_set.shape[0]
            class_probs[objective_class] = Prob_is_class
        #print(class_probs)    
        def helper_func(x):
            
            new_x = x.drop(["journal_type","region_type","article"])
            words_in_test = new_x[new_x.notnull()].index.tolist()
            pred_prob_for_each_class = {}
            max_record = float("-inf")
            argmax = objective_classes[0]
            
            
            for objective_class in objective_classes:
                log_prob = 0 #initialize the log_prob to be log 1 which is set to be 0
                log_prob += np.log(class_probs[objective_class])   
                objected_train_set = train_set.loc[train_set[objective_type] == objective_class]
                #print(objected_train_set[objective_type])
                denominator = objected_train_set.drop(columns = ["journal_type","region_type","article"]).sum().sum() + self.Vocab_size
                conditional_probs = (objected_train_set[words_in_test].sum(axis = 0).add(1)).divide(denominator)
                #print(np.log(conditional_probs))
                #print(new_x[new_x.notnull()])
                conditional_probs = np.log(conditional_probs) * new_x[new_x.notnull()]
                
                log_prob += conditional_probs.sum()
                
                pred_prob_for_each_class[objective_class] = log_prob
                if (max_record != max(max_record,log_prob)):
                    argmax = objective_class
                    max_record = log_prob
            
            #x[objective_type + "pred_probs"] = pred_prob_for_each_class
            #x[objective_type + "pred_result"] = argmax
            return {"pred_prob_for_each_class":pred_prob_for_each_class,"argmax":argmax,"true_value":x[objective_type]}  
            
        pred = test_set.apply(lambda x: helper_func(x),axis = 1)
        
        
        return pred
            
            
    
    def cross_validation(self,K = 10):
        '''
        First shuffle the original dataset, than evenly split it into K fold
        
        Each time of training, we use one fold as testset, we keep looping through the folds.
        
        Record the True value and prediction value in each cv set.
        '''
        row_num_1 = self.shuffled_df.shape[0]
        row_num_2 = self.get_economist().shape[0]
        piece_size_1 = int(np.round(row_num_1/K,0))
        piece_size_2 = int(np.round(row_num_2/K,0))
        start_ind_1 = 0
        start_ind_2 = 0
        indices_1 = np.arange(row_num_1)
        indices_2 = np.arange(row_num_2)
        for i in range(10):
            print("Using Part {} fold as our test set".format(i+1))
            test_index_1 = indices_1[start_ind_1:(i+1)*piece_size_1]
            train_index_1 = np.append(indices_1[0:start_ind_1],indices_1[(i+1)*piece_size_1:row_num_1])
            objective_type_1 = "journal_type"
            objective_classes_1 = ["economist","onion"]
            test_ret_1 = self.nb_algo(train_index_1,test_index_1,self.shuffled_df,objective_classes_1,objective_type_1)
            print(test_ret_1)
            self.cv_result_list_1.append(pd.DataFrame(test_ret_1.to_dict()).T)
            
            
            test_index_2 = indices_2[start_ind_2:(i+1)*piece_size_2]
            train_index_2 = np.append(indices_2[0:start_ind_2],indices_2[(i+1)*piece_size_2:row_num_2])
            objective_type_2 = "region_type"
            objective_classes_2 = ["africa","asia","britain","europe","international","latin_america","north_america"]
            test_ret_2 = self.nb_algo(train_index_2,test_index_2,self.get_economist(),objective_classes_2,objective_type_2)
            print(test_ret_2)
            self.cv_result_list_2.append(pd.DataFrame(test_ret_2.to_dict()).T)
            
            start_ind_1 = (i+1)*piece_size_1
            start_ind_2 = (i+1)*piece_size_2
            print("Finished Part {} fold as our test set".format(i+1))
            
        
    def get_matrices(self):
        '''
        used to get different evaluation matrices
        For example: confusion matrix, recall, precision, accuracy
        
        '''
        def helper_func_2(result_list,objective_classes):
            df_list = []
            for df in result_list:
                df_dict = {}
                for objective_class in objective_classes:
                    sub_df = df.loc[df.true_value == objective_class]
                    sub_dict = {}
                    for sub_objective_class in objective_classes:
                        sub_sum = sub_df.loc[sub_df.argmax == sub_objective_class].shape[0]
                        sub_dict["selected_" + sub_objective_class] = sub_sum                 
                    df_dict["true_" + objective_class] = sub_dict
                df_list.append(pd.DataFrame(df_dict))
            return df_list
                  
        def helper_func_3(confusion_matrix,objective_classes):
            recall_dict = {}
            precision_dict = {}
            total_true = 0
            for objective_class in objective_classes:
                recall = confusion_matrix.loc["selected_" + objective_class,"true_" + objective_class]/confusion_matrix.loc["selected_" + objective_class].sum()
                recall_dict[objective_class + "_recall"] = recall
                precision = confusion_matrix.loc["selected_" + objective_class,"true_" + objective_class]/confusion_matrix["true_" + objective_class].sum()
                precision_dict[objective_class + "_precision"] = precision
                total_true += confusion_matrix.loc["selected_" + objective_class,"true_" + objective_class]
            accuracy = total_true/confusion_matrix.sum().sum()
            
            return recall_dict,precision_dict,accuracy
        
        objective_classes_1 = ["economist","onion"]
        df1_list = helper_func_2(nb.cv_result_list_1,objective_classes_1)
        self.confusion_matrices_1 = df1_list
        for confusion_matric_1 in self.confusion_matrices_1:
            recall_dict,precision_dict,accuracy = helper_func_3(confusion_matric_1,objective_classes_1)
            self.recall_1.append(recall_dict)
            self.precision_1.append(precision_dict)
            self.accuracy_1.append(accuracy)
        
        
        objective_classes_2 = ["africa","asia","britain","europe","international","latin_america","north_america"]
        df2_list = helper_func_2(nb.cv_result_list_2,objective_classes_2)
        self.confusion_matrices_2 = df2_list
        for confusion_matric_2 in self.confusion_matrices_2:
            recall_dict,precision_dict,accuracy = helper_func_3(confusion_matric_2,objective_classes_2)
            self.recall_2.append(recall_dict)
            self.precision_2.append(precision_dict)
            self.accuracy_2.append(accuracy)
        
        
        return None

In [21]:
nb = My_Bayes_Classifier(df)

In [22]:
nb.cross_validation(10)

Using Part 1 fold as our test set
451    {'pred_prob_for_each_class': {'economist': -75...
334    {'pred_prob_for_each_class': {'economist': -37...
296    {'pred_prob_for_each_class': {'economist': -48...
590    {'pred_prob_for_each_class': {'economist': -54...
461    {'pred_prob_for_each_class': {'economist': -39...
                             ...                        
670    {'pred_prob_for_each_class': {'economist': -66...
104    {'pred_prob_for_each_class': {'economist': -72...
300    {'pred_prob_for_each_class': {'economist': -87...
684    {'pred_prob_for_each_class': {'economist': -30...
637    {'pred_prob_for_each_class': {'economist': -49...
Length: 72, dtype: object
451    {'pred_prob_for_each_class': {'africa': -7980....
334    {'pred_prob_for_each_class': {'africa': -3832....
296    {'pred_prob_for_each_class': {'africa': -5023....
461    {'pred_prob_for_each_class': {'africa': -4048....
284    {'pred_prob_for_each_class': {'africa': -3685....
181    {'pred_prob_for_each_

325    {'pred_prob_for_each_class': {'africa': -8538....
331    {'pred_prob_for_each_class': {'africa': -3070....
512    {'pred_prob_for_each_class': {'africa': -4258....
410    {'pred_prob_for_each_class': {'africa': -426.9...
200    {'pred_prob_for_each_class': {'africa': -3557....
499    {'pred_prob_for_each_class': {'africa': -6122....
443    {'pred_prob_for_each_class': {'africa': -210.3...
486    {'pred_prob_for_each_class': {'africa': -9103....
482    {'pred_prob_for_each_class': {'africa': -8460....
314    {'pred_prob_for_each_class': {'africa': -210.3...
469    {'pred_prob_for_each_class': {'africa': -4960....
479    {'pred_prob_for_each_class': {'africa': -6606....
81     {'pred_prob_for_each_class': {'africa': -16332...
392    {'pred_prob_for_each_class': {'africa': -14791...
218    {'pred_prob_for_each_class': {'africa': -3326....
305    {'pred_prob_for_each_class': {'africa': -4353....
242    {'pred_prob_for_each_class': {'africa': -6238....
77     {'pred_prob_for_each_cla

463    {'pred_prob_for_each_class': {'economist': -46...
715    {'pred_prob_for_each_class': {'economist': -56...
686    {'pred_prob_for_each_class': {'economist': -68...
701    {'pred_prob_for_each_class': {'economist': -49...
72     {'pred_prob_for_each_class': {'economist': -26...
                             ...                        
436    {'pred_prob_for_each_class': {'economist': -23...
249    {'pred_prob_for_each_class': {'economist': -60...
42     {'pred_prob_for_each_class': {'economist': -43...
582    {'pred_prob_for_each_class': {'economist': -87...
33     {'pred_prob_for_each_class': {'economist': -66...
Length: 72, dtype: object
292    {'pred_prob_for_each_class': {'africa': -3264....
463    {'pred_prob_for_each_class': {'africa': -4996....
72     {'pred_prob_for_each_class': {'africa': -27104...
311    {'pred_prob_for_each_class': {'africa': -5178....
233    {'pred_prob_for_each_class': {'africa': -8206....
202    {'pred_prob_for_each_class': {'africa': -3974....
289  

681    {'pred_prob_for_each_class': {'economist': -47...
431    {'pred_prob_for_each_class': {'economist': -48...
205    {'pred_prob_for_each_class': {'economist': -51...
130    {'pred_prob_for_each_class': {'economist': -78...
678    {'pred_prob_for_each_class': {'economist': -55...
                             ...                        
657    {'pred_prob_for_each_class': {'economist': -57...
422    {'pred_prob_for_each_class': {'economist': -57...
568    {'pred_prob_for_each_class': {'economist': -65...
394    {'pred_prob_for_each_class': {'economist': -65...
223    {'pred_prob_for_each_class': {'economist': -50...
Length: 72, dtype: object
431    {'pred_prob_for_each_class': {'africa': -5115....
205    {'pred_prob_for_each_class': {'africa': -5347....
130    {'pred_prob_for_each_class': {'africa': -8101....
105    {'pred_prob_for_each_class': {'africa': -6166....
283    {'pred_prob_for_each_class': {'africa': -4934....
523    {'pred_prob_for_each_class': {'africa': -5161....
190  

In [23]:
nb.cv_result_list_1

[                              pred_prob_for_each_class     argmax true_value
 451  {'economist': -7537.391599216607, 'onion': -81...  economist  economist
 334  {'economist': -3727.8057845142503, 'onion': -3...  economist  economist
 296  {'economist': -4817.085142393696, 'onion': -52...  economist  economist
 590  {'economist': -5471.688351519057, 'onion': -52...      onion      onion
 461  {'economist': -3968.5026118400806, 'onion': -4...  economist  economist
 ..                                                 ...        ...        ...
 670  {'economist': -6627.910726016992, 'onion': -65...      onion      onion
 104  {'economist': -7267.581198523045, 'onion': -77...  economist  economist
 300  {'economist': -8741.09058925305, 'onion': -939...  economist  economist
 684  {'economist': -3049.4448116990425, 'onion': -2...      onion      onion
 637  {'economist': -4972.766429386997, 'onion': -48...      onion      onion
 
 [72 rows x 3 columns],
                               pred_pr

In [24]:
nb.cv_result_list_2

[                              pred_prob_for_each_class         argmax  \
 451  {'africa': -7980.940790917747, 'asia': -7997.1...         europe   
 334  {'africa': -3832.557523643525, 'asia': -3810.5...           asia   
 296  {'africa': -5023.302882871332, 'asia': -5034.3...         europe   
 461  {'africa': -4048.6043362531254, 'asia': -4083....         europe   
 284  {'africa': -3685.755083900828, 'asia': -3689.0...  latin_america   
 181  {'africa': -14980.803449158193, 'asia': -15080...  north_america   
 480  {'africa': -249.2715086182572, 'asia': -249.61...         europe   
 322  {'africa': -3551.2910393952393, 'asia': -3517....        britain   
 231  {'africa': -5689.298131501833, 'asia': -5725.3...        britain   
 273  {'africa': -2852.12698474802, 'asia': -2835.74...           asia   
 71   {'africa': -5026.5201454730595, 'asia': -5182....         africa   
 192  {'africa': -3065.8460764883275, 'asia': -3092....  north_america   
 437  {'africa': -188.20716742214046, 

# Step3: Evalution

In [25]:
nb.get_matrices()



## confusion_matrices for each test fold in task 1

In [26]:
nb.confusion_matrices_1

[                    true_economist  true_onion
 selected_economist              56           0
 selected_onion                   1          15,
                     true_economist  true_onion
 selected_economist              53           0
 selected_onion                   3          16,
                     true_economist  true_onion
 selected_economist              49           1
 selected_onion                   1          21,
                     true_economist  true_onion
 selected_economist              51           0
 selected_onion                   0          21,
                     true_economist  true_onion
 selected_economist              52           0
 selected_onion                   0          20,
                     true_economist  true_onion
 selected_economist              53           0
 selected_onion                   1          18,
                     true_economist  true_onion
 selected_economist              52           1
 selected_onion                   

In [27]:
nb.confusion_matrices_1[5] # Eg the confusion matrix that uses sixth fold

Unnamed: 0,true_economist,true_onion
selected_economist,53,0
selected_onion,1,18


## confusion_matrices for each test fold in task 2

In [28]:
nb.confusion_matrices_2

[                        true_africa  true_asia  true_britain  true_europe  \
 selected_africa                   1          2             0            1   
 selected_asia                     0          6             0            0   
 selected_britain                  0          3             5            2   
 selected_europe                   0          0             1           11   
 selected_international            0          0             0            0   
 selected_latin_america            0          0             0            0   
 selected_north_america            0          0             0            0   
 
                         true_international  true_latin_america  \
 selected_africa                          1                   1   
 selected_asia                            0                   1   
 selected_britain                         1                   2   
 selected_europe                          0                   1   
 selected_international                

In [29]:
nb.confusion_matrices_2[1] # Eg the confusion matrix that uses sixth fold

Unnamed: 0,true_africa,true_asia,true_britain,true_europe,true_international,true_latin_america,true_north_america
selected_africa,6,0,1,0,0,0,0
selected_asia,0,6,0,0,0,0,0
selected_britain,0,1,8,0,6,0,2
selected_europe,0,1,0,11,3,0,0
selected_international,0,0,1,0,0,0,0
selected_latin_america,0,0,0,0,0,3,0
selected_north_america,0,0,0,0,0,0,4


## recall score for each class for each test fold in task 1

In [30]:
nb.recall_1

[{'economist_recall': 1.0, 'onion_recall': 0.9375},
 {'economist_recall': 1.0, 'onion_recall': 0.8421052631578947},
 {'economist_recall': 0.98, 'onion_recall': 0.9545454545454546},
 {'economist_recall': 1.0, 'onion_recall': 1.0},
 {'economist_recall': 1.0, 'onion_recall': 1.0},
 {'economist_recall': 1.0, 'onion_recall': 0.9473684210526315},
 {'economist_recall': 0.9811320754716981, 'onion_recall': 0.9473684210526315},
 {'economist_recall': 1.0, 'onion_recall': 1.0},
 {'economist_recall': 1.0, 'onion_recall': 1.0},
 {'economist_recall': 1.0, 'onion_recall': 0.9583333333333334}]

## recall score for each class for each test fold in task 1

In [31]:
nb.recall_2

[{'africa_recall': 0.16666666666666666,
  'asia_recall': 0.8571428571428571,
  'britain_recall': 0.29411764705882354,
  'europe_recall': 0.7333333333333333,
  'international_recall': nan,
  'latin_america_recall': 1.0,
  'north_america_recall': 1.0},
 {'africa_recall': 0.8571428571428571,
  'asia_recall': 1.0,
  'britain_recall': 0.47058823529411764,
  'europe_recall': 0.7333333333333333,
  'international_recall': 0.0,
  'latin_america_recall': 1.0,
  'north_america_recall': 1.0},
 {'africa_recall': 0.5714285714285714,
  'asia_recall': 1.0,
  'britain_recall': 0.6,
  'europe_recall': 0.8235294117647058,
  'international_recall': nan,
  'latin_america_recall': 1.0,
  'north_america_recall': 1.0},
 {'africa_recall': 0.7692307692307693,
  'asia_recall': 0.8333333333333334,
  'britain_recall': 0.5294117647058824,
  'europe_recall': 0.7692307692307693,
  'international_recall': nan,
  'latin_america_recall': 1.0,
  'north_america_recall': 1.0},
 {'africa_recall': 1.0,
  'asia_recall': 0.8,


## precision score for each class for each test fold in task 1

In [32]:
nb.precision_1

[{'economist_precision': 0.9824561403508771, 'onion_precision': 1.0},
 {'economist_precision': 0.9464285714285714, 'onion_precision': 1.0},
 {'economist_precision': 0.98, 'onion_precision': 0.9545454545454546},
 {'economist_precision': 1.0, 'onion_precision': 1.0},
 {'economist_precision': 1.0, 'onion_precision': 1.0},
 {'economist_precision': 0.9814814814814815, 'onion_precision': 1.0},
 {'economist_precision': 0.9811320754716981,
  'onion_precision': 0.9473684210526315},
 {'economist_precision': 1.0, 'onion_precision': 1.0},
 {'economist_precision': 1.0, 'onion_precision': 1.0},
 {'economist_precision': 0.9795918367346939, 'onion_precision': 1.0}]

## precision score for each class for each test fold in task 2

In [33]:
nb.precision_2

[{'africa_precision': 1.0,
  'asia_precision': 0.5454545454545454,
  'britain_precision': 0.8333333333333334,
  'europe_precision': 0.7857142857142857,
  'international_precision': 0.0,
  'latin_america_precision': 0.375,
  'north_america_precision': 0.45454545454545453},
 {'africa_precision': 1.0,
  'asia_precision': 0.75,
  'britain_precision': 0.8,
  'europe_precision': 1.0,
  'international_precision': 0.0,
  'latin_america_precision': 1.0,
  'north_america_precision': 0.6666666666666666},
 {'africa_precision': 1.0,
  'asia_precision': 0.875,
  'britain_precision': 1.0,
  'europe_precision': 0.875,
  'international_precision': 0.0,
  'latin_america_precision': 0.6,
  'north_america_precision': 0.6666666666666666},
 {'africa_precision': 1.0,
  'asia_precision': 0.8333333333333334,
  'britain_precision': 1.0,
  'europe_precision': 0.9090909090909091,
  'international_precision': 0.0,
  'latin_america_precision': 0.125,
  'north_america_precision': 0.75},
 {'africa_precision': 0.57142

## final test accuracy for each test fold in task 1

In [34]:
nb.accuracy_1

[0.9861111111111112,
 0.9583333333333334,
 0.9722222222222222,
 1.0,
 1.0,
 0.9861111111111112,
 0.9722222222222222,
 1.0,
 1.0,
 0.9861111111111112]

## final test accuracy for each test fold in task 2

In [35]:
nb.accuracy_2

[0.5849056603773585,
 0.7169811320754716,
 0.7735849056603774,
 0.7169811320754716,
 0.6226415094339622,
 0.6415094339622641,
 0.7358490566037735,
 0.7547169811320755,
 0.7924528301886793,
 0.7358490566037735]

# Step 4: Analyzing the result

## Question：Analyze your results: Are you surprised (positively or negatively) by the results? What did you expect and why? Examine any systematic errors your classifier makes. Can you come up with a plausible explanation?

Answer: I am positively surprised by the results given out by naive bayes algorithm. We can easily observe that our nb_classifier did a better job in classifying journal_type than region_type. That is because doing a multinomial classification is much harder than doing binomial classification. Since multinomial classification include more classes but even less train size (because we remove out onion before doing classification), that will inevitably influence the accuracy score. 

My classifier made classification error when predicting an article which its true region_type is international. that is because the imbalanced sample we had. In all the economist journals, the international articles only takes account a very samll part of it. This will incur a huge problem when we try to compute both class probabilities and conditional probability of each word given class is international. 

In [36]:
def test_nb_algo(data_df,objective_classes,objective_type):
        '''
        The main algorithm of naive bayes
        '''
        class_probs = {}
        train_set = data_df
        data = np.ones(len(data_df.columns))
        test = pd.DataFrame(data.reshape(1,-1),columns = data_df.columns)
        Vocab_size = len(data_df.columns) - 3
        for objective_class in objective_classes:        
            Prob_is_class = train_set.loc[train_set[objective_type] == objective_class].shape[0]/train_set.shape[0]
            class_probs[objective_class] = Prob_is_class
        #print(class_probs)    
        def helper_func(x):
            new_x = x.drop(columns = ["journal_type","region_type","article"])
            words_in_test = new_x[new_x.notnull()].columns.tolist()
            pred_prob_for_each_class = {}
            largest_weight_for_each_class = {}
            smallest_weight_for_each_class = {}
            max_record = float("-inf")
            argmax = objective_classes[0]
            
            
            for objective_class in objective_classes:
                log_prob = 0 #initialize the log_prob to be log 1 which is set to be 0
                log_prob += np.log(class_probs[objective_class])   
                objected_train_set = train_set.loc[train_set[objective_type] == objective_class]
                denominator = objected_train_set.drop(columns = ["journal_type","region_type","article"]).sum().sum() + Vocab_size
                conditional_probs = (objected_train_set[words_in_test].sum(axis = 0).add(1)).divide(denominator)
                conditional_probs = np.log(conditional_probs) * new_x[new_x.notnull()]
                conditional_probs = conditional_probs.rename(index={0: "conditional_prob"})
                nlargest = conditional_probs.T.nlargest(5,"conditional_prob")
                nsmallest = conditional_probs.T.nsmallest(5,"conditional_prob")
                largest_weight_for_each_class[objective_class] = nlargest
                smallest_weight_for_each_class[objective_class] = nsmallest
                
                log_prob += conditional_probs.sum().sum()
                
                pred_prob_for_each_class[objective_class] = log_prob
                if (max_record != max(max_record,log_prob)):
                    argmax = objective_class
                    max_record = log_prob
            
            #x[objective_type + "pred_probs"] = pred_prob_for_each_class
            #x[objective_type + "pred_result"] = argmax
            return {"largest_weight_for_each_class":largest_weight_for_each_class, "smallest_weight_for_each_class":smallest_weight_for_each_class}  
            
        pred = helper_func(test)
        
        
        return pred

In [37]:
objective_classes_1 = ["economist","onion"]
objective_type_1 = "journal_type"
test1 = test_nb_algo(nb.shuffled_df,objective_classes_1,objective_type_1)

objective_type_2 = "region_type"
objective_classes_2 = ["africa","asia","britain","europe","international","latin_america","north_america"]
test2 = test_nb_algo(nb.shuffled_df,objective_classes_2,objective_type_2)

In [38]:
test1 # since we set the test txt to be whole vocab list each vocab appears once and It is totally made up, so we do not have a true value

{'largest_weight_for_each_class': {'economist':      conditional_prob
  the         -3.076424
  of          -3.679851
  to          -3.718651
  a           -3.911538
  in          -4.057912,
  'onion':      conditional_prob
  the         -3.420985
  to          -3.960761
  of          -4.054697
  a           -4.260989
  and         -4.290763},
 'smallest_weight_for_each_class': {'economist':                    conditional_prob
  Expert                   -13.086251
  Me                       -13.086251
  Dustbuster               -13.086251
  asleep.                  -13.086251
  two-by-three-foot        -13.086251,
  'onion':        conditional_prob
  fleet          -12.1594
  AP             -12.1594
  Mr             -12.1594
  mix.           -12.1594
  Iran.          -12.1594}}

In [39]:
test2

{'largest_weight_for_each_class': {'africa':      conditional_prob
  the         -3.608008
  of          -4.210771
  to          -4.324937
  a           -4.556843
  in          -4.594730,
  'asia':      conditional_prob
  the         -3.575473
  of          -4.215794
  to          -4.280194
  a           -4.421676
  in          -4.580520,
  'britain':      conditional_prob
  the         -3.583194
  of          -4.115016
  to          -4.142749
  a           -4.366745
  and         -4.503022,
  'europe':      conditional_prob
  the         -3.523930
  of          -4.207680
  to          -4.224609
  a           -4.405956
  in          -4.570196,
  'international':      conditional_prob
  the         -3.862915
  of          -4.368702
  to          -4.458786
  in          -4.739943
  a           -4.756732,
  'latin_america':      conditional_prob
  the         -3.818793
  to          -4.435852
  of          -4.444462
  a           -4.646794
  in          -4.814023,
  'north_america':      

## Question: Analyze your model: Train on the full set of data and look at the weights assigned to the words in your model. For each class, list the five words the model says one is most likely to observe, given that class. Do you think these words are most representative of the class? If not, list five words you feel are more representative of that class, and explain your choice. Why might these lists of words be different? Are some types of words more useful for distinguishing between classes than others? Do the same analysis for the least probable words, given the class.

Answer: obviously, the words that the model says one is most likely to observe are the most common words that will appear in any classes of articles. These words say almost say nothing about what category the whole article should belongs to. In order to get more useful words and offset the frequency disadvantage of those useful words, we can penalize each word by it appear frequency somehow. It should be implemented as a hyperparameter, which needs to be tuned in trials. This is because the frequency do have some power in determine which class the article belongs to. The five word for Economist articles I will give out is finance,weapon,politic,stock,government, while Onions' five words should be game, entertainment, trip, brand, fashion. Those words might not appear the most frequently in an article, but they are words that represent the style of articles from different classes. The five least probable words also give no clue about what classes should the article belongs to. Most of them are just specific numerical numbers or human names.