In [2]:
import pandas as pd
import time
from datetime import datetime
from datetime import date
from datetime import timedelta
import matplotlib.pyplot as plt

#read csv file
def readCsvFile(path):
    """Load the article CSV at ``path`` and keep only the analysis columns.

    Parameters
    ----------
    path : str or path-like
        Location of a UTF-8 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        The ``id``, ``published``, ``type`` and ``category`` columns, in that
        order. ``DataFrame.filter`` skips any of these names that the file
        does not contain instead of raising.
    """
    wanted_columns = ['id', 'published', 'type', 'category']
    return pd.read_csv(path, encoding='utf-8').filter(items=wanted_columns)

#cleaning published column
def cleanPublished(input_df):
    """Drop rows without a publish timestamp and keep only the date part.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with a string ``published`` column whose values look like
        ``"YYYY-MM-DD"`` or ``"YYYY-MM-DD <time ...>"``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the caller's frame is left untouched) containing only
        the rows whose ``published`` value is not missing, with everything
        from the first space onward removed from ``published``.
    """
    # Take an explicit copy: assigning a column on a filtered slice is what
    # triggers pandas' SettingWithCopyWarning and may or may not write
    # through to the caller's frame.
    cleaned_df = input_df[input_df['published'].notna()].copy()
    # An all-missing column is typically float dtype and has no .str
    # accessor, so only touch the column when there is something to clean.
    if not cleaned_df.empty:
        # n=1 splits on the first space only, so values with several
        # space-separated parts (e.g. a trailing timezone) are handled too.
        cleaned_df['published'] = cleaned_df['published'].str.split(" ", n=1).str[0]
    return cleaned_df

# returns DF with articles within window
def lookbackWindowDF(windowSize, currentDate, input_df):
    """Return the articles published within a look-back window.

    Parameters
    ----------
    windowSize : int
        Number of days to look back from ``currentDate``.
    currentDate : str
        Last day of the window, formatted ``"%Y-%m-%d"``.
    input_df : pandas.DataFrame
        Frame with ``id``, ``published`` (``"%Y-%m-%d"`` strings), ``type``
        and ``category`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows whose publish date lies in
        ``[currentDate - windowSize days, currentDate]`` (both ends
        inclusive), restricted to the four columns above and re-indexed
        from 0. ``published`` keeps its original string values.
    """
    current = datetime.strptime(currentDate,"%Y-%m-%d")
    oldest = current - timedelta(days=windowSize)
    # Parse the whole column once instead of calling strptime on a freshly
    # built row Series for every row; this function runs once per window in
    # slidingWindow, so the per-row cost dominated the analysis.
    published_dates = pd.to_datetime(input_df['published'], format="%Y-%m-%d")
    # Use a plain ndarray mask so selection is positional and does not
    # depend on the frame's index labels.
    in_window = ((published_dates >= oldest) & (published_dates <= current)).to_numpy()
    window_columns = ['id', 'published', 'type', 'category']
    return input_df.loc[in_window, window_columns].reset_index(drop=True)

# returns fraction of fake news in a particular category
def fakenessOfCategoryDF(input_df,categoryName):
    """Fraction of the articles in ``categoryName`` that are labelled fake.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``category`` and ``type`` columns (may be empty).
    categoryName : str
        Category to inspect.

    Returns
    -------
    float
        ``(# rows with type == 'fake') / (# rows in the category)``, or
        ``0.0`` when the frame is empty, the category has no rows, or the
        category has no fake rows.
    """
    # Checked first so an entirely empty frame (no columns) is never indexed.
    if input_df.empty:
        return 0.0
    in_category = input_df[input_df['category'] == categoryName]
    n_category = len(in_category)
    if n_category == 0:
        return 0.0
    n_fake = len(in_category[in_category['type'] == 'fake'])
    return n_fake / n_category if n_fake else 0.0

# returns the fraction of news which fall under particular category
def categoryFractionDF(input_df, categoryName):
    """Share of all articles in ``input_df`` that belong to ``categoryName``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with a ``category`` column (may be empty).
    categoryName : str
        Category to measure.

    Returns
    -------
    float
        ``(# rows in the category) / (# rows in the frame)``, or ``0.0``
        when the frame is empty or holds no row of that category.
    """
    # Checked first so an entirely empty frame (no columns) is never indexed.
    if input_df.empty:
        return 0.0
    n_total = len(input_df)
    n_category = len(input_df[input_df['category'] == categoryName])
    return n_category / n_total if n_category else 0.0


In [3]:
def lookbackWindow(windowSize, currentDate, input_df, columnName):
    """Collect one column's values for the articles inside a look-back window.

    Parameters
    ----------
    windowSize : int
        Number of days to look back from ``currentDate``.
    currentDate : str
        Last day of the window, formatted ``"%Y-%m-%d"``.
    input_df : pandas.DataFrame
        Frame with a ``published`` column of ``"%Y-%m-%d"`` strings and a
        ``columnName`` column.
    columnName : str
        Column whose values are returned.

    Returns
    -------
    list
        Values of ``columnName``, in row order, for rows published in
        ``[currentDate - windowSize days, currentDate]`` (both ends
        inclusive).
    """
    current = datetime.strptime(currentDate,"%Y-%m-%d")
    oldest = current - timedelta(days=windowSize)
    # Parse the column once rather than building a row Series and calling
    # strptime for every row.
    published_dates = pd.to_datetime(input_df['published'], format="%Y-%m-%d")
    # ndarray mask -> positional selection, independent of index labels.
    in_window = ((published_dates >= oldest) & (published_dates <= current)).to_numpy()
    return input_df.loc[in_window, columnName].tolist()

# predecessor of fakenessOfCategoryDF method (for reference only)
def fakenessOfCategory(windowSize, currentDate, input_df, categoryName):
    """Fraction of fake articles of one category inside a look-back window.

    Parameters
    ----------
    windowSize : int
        Number of days to look back from ``currentDate``.
    currentDate : str
        End of the window, formatted ``"%Y-%m-%d"``.
    input_df : pandas.DataFrame
        Frame with ``published`` (``"%Y-%m-%d"`` strings), ``category`` and
        ``type`` columns.
    categoryName : str
        Category to inspect.

    Returns
    -------
    float or int
        ``fake / matched`` over the rows of ``categoryName`` published in
        ``[currentDate - windowSize days, currentDate)``; the integer ``0``
        when no row matches.
    """
    window_end = datetime.strptime(currentDate,"%Y-%m-%d")
    window_start = window_end - timedelta(days=windowSize)
    fake_rows = 0
    matched_rows = 0
    for position in range(len(input_df['published'])):
        row = input_df.iloc[position]
        published_on = datetime.strptime(row['published'],"%Y-%m-%d")
        # The upper bound is exclusive here: rows dated exactly currentDate
        # are not counted.
        in_window = window_start <= published_on < window_end
        if not (in_window and row['category']==categoryName):
            continue
        matched_rows += 1
        if row['type']=='fake':
            fake_rows += 1
    return fake_rows/matched_rows if matched_rows else matched_rows

# for calculating fractions and identifying trending category.
def categoryFraction(trending):
    """Compute each known category's share of the labels in ``trending``.

    Parameters
    ----------
    trending : sequence
        Category labels, one per article.

    Returns
    -------
    dict
        Keys ``business``, ``entertainment``, ``politics``, ``sport``,
        ``tech`` and ``others`` (in that order). Each value is the label's
        count divided by ``len(trending)``; labels that are not one of the
        keys are counted under ``others``. For an empty input every value
        is the integer ``0``.
    """
    buckets = ('business', 'entertainment', 'politics', 'sport', 'tech', 'others')
    counts = dict.fromkeys(buckets, 0)
    total = len(trending)
    if total == 0:
        return counts
    for label in trending:
        bucket = label if label in counts else 'others'
        counts[bucket] += 1
    return {bucket: count / total for bucket, count in counts.items()}

def fakenessFraction(trending):
    """Compute the share of ``'real'`` and ``'fake'`` labels in ``trending``.

    Parameters
    ----------
    trending : sequence
        Article type labels; each must be ``'real'`` or ``'fake'``.

    Returns
    -------
    dict
        ``{'real': share, 'fake': share}``. For an empty input both values
        are the integer ``0``.

    Raises
    ------
    KeyError
        If a label other than ``'real'`` or ``'fake'`` is encountered.
    """
    tally = {'real': 0, 'fake': 0}
    total = len(trending)
    if total == 0:
        return tally
    for label in trending:
        # Plain indexing on purpose: an unexpected label raises KeyError.
        tally[label] = tally[label] + 1
    return {label: count / total for label, count in tally.items()}


In [4]:
# returns the earliest and latest publication dates among the articles in the dataset
def minmaxPublishedDate(input_df):
    """Return the smallest and largest values of the ``published`` column.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with a ``published`` column.

    Returns
    -------
    list
        ``[min, max]`` of ``published``. For ``"%Y-%m-%d"`` strings the
        lexicographic order used here coincides with chronological order.
    """
    published = input_df['published']
    earliest, latest = published.min(), published.max()
    return [earliest, latest]

# Applies lookbackWindowDF across the entire dataset to generate the analysis
def slidingWindow(windowSize,minDate,maxDate,input_df,categoryName):
    """Slide a look-back window over the dataset, newest to oldest.

    Starting with a window ending on ``maxDate``, the window end is moved
    back by ``windowSize`` days at a time until it falls before ``minDate``.
    For each position the articles selected by ``lookbackWindowDF`` are
    summarised for ``categoryName``.

    Parameters
    ----------
    windowSize : int
        Window length and step, in days. Must be positive.
    minDate, maxDate : str
        Oldest and newest dates to cover, formatted ``"%Y-%m-%d"``.
    input_df : pandas.DataFrame
        Articles, passed through to ``lookbackWindowDF``.
    categoryName : str
        Category to summarise.

    Returns
    -------
    list of [float, float]
        One ``[categoryFractionDF(...), fakenessOfCategoryDF(...)]`` pair
        per window, ordered from the newest window to the oldest.

    Raises
    ------
    ValueError
        If ``windowSize`` is not positive.
    """
    # With a zero or negative step the window end never moves below minDate,
    # so the loop below would not terminate.
    if windowSize <= 0:
        raise ValueError("windowSize must be a positive number of days")
    minValue = datetime.strptime(minDate,"%Y-%m-%d")
    current = datetime.strptime(maxDate,"%Y-%m-%d")
    results = []
    while(current>=minValue):
        currentDate = datetime.strftime(current,"%Y-%m-%d")
        sub_df = lookbackWindowDF(windowSize, currentDate, input_df)
        fraction = categoryFractionDF(sub_df, categoryName)
        results.append([fraction,fakenessOfCategoryDF(sub_df, categoryName)])
        # NOTE(review): the step equals the window length, so if
        # lookbackWindowDF includes both window ends, articles dated exactly
        # on a boundary are counted in two consecutive windows; confirm
        # this is intended.
        current = current - timedelta(days=windowSize)
    return results

In [5]:
#Plotting charts
def drawPlot(X,Y,plotType,windowSize,categoryName):
    """Plot ``Y`` against ``X`` and save the figure as a PNG.

    Parameters
    ----------
    X, Y : sequence of numbers
        Values for the x axis (labelled "Trend Fraction") and the y axis
        (labelled "Fake Fraction").
    plotType : str
        ``'scatter'`` draws a scatter plot; any other value draws a line.
    windowSize : int
        Only used to build the output file name.
    categoryName : str
        Used in the title and in the output file name.

    The image is written to ``<categoryName><plotType><windowSize>.png`` in
    the current working directory at 400 dpi.
    """
    fig, ax = plt.subplots()
    # Anything other than 'scatter' falls back to a line plot.
    render = ax.scatter if plotType=='scatter' else ax.plot
    render(X, Y, label='Trend')
    # Axis labels, title and legend.
    ax.set_xlabel('Trend Fraction')
    ax.set_ylabel('Fake Fraction')
    title = 'Fake Fraction v/s Trend Fraction for '+categoryName+' category'
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    output_name = ''.join([categoryName, plotType, str(windowSize), '.png'])
    fig.savefig(output_name, dpi = 400)


In [13]:
# Load and clean the dataset, then correlate the trend fraction with the fake
# fraction for every (category, window size) combination.
path = '../Datasets/Working_Data/all_data_refined_v2.csv'
input_df = readCsvFile(path)
input_df = cleanPublished(input_df)
# NOTE(review): currentDate, windowSize and categoryName are not read by the
# loop below; they are kept in case other cells rely on them.
currentDate = "2016-11-15"
windowSize = 7
categoryName = "politics"
minDate,maxDate=minmaxPublishedDate(input_df)
# Maps "<category><window size>" (e.g. "politics7") to the correlation.
correlations = dict()

for i in ['business', 'politics', 'sport', 'entertainment', 'tech']:
    for j in [3,5,7,9]:
        points = slidingWindow(j,minDate,maxDate,input_df,i)
        # Each point is [category (trend) fraction, fake fraction]; name the
        # columns accordingly. They were previously labelled the other way
        # round, which does not change the (symmetric) correlation.
        X = [point[0] for point in points]
        Y = [point[1] for point in points]
        correlation_df = pd.DataFrame({'trendFraction': X, 'fakeFraction': Y})
        correlations[i+str(j)]=correlation_df['trendFraction'].corr(correlation_df['fakeFraction'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyboardInterrupt: 

In [14]:
correlations

{'business3': 0.0818714316665238,
 'business5': 0.02118969764912603,
 'business7': 0.20295010007567366,
 'business9': 0.36713817846806684,
 'politics3': -0.9377701488040342,
 'politics5': -0.9411239466460215,
 'politics7': -0.9432080717623358,
 'politics9': -0.940521447831278,
 'sport3': 0.7829661544097727,
 'sport5': 0.7953040175524619,
 'sport7': 0.9639752625271658,
 'sport9': 0.9026004789053418,
 'entertainment3': -0.08143533063932583,
 'entertainment5': 0.17612041772950768,
 'entertainment7': -0.2477528802333515,
 'entertainment9': -0.56528494168596}

In [15]:
# Correlation between trend fraction and fake fraction for the 'tech'
# category, for each window size.
for i in [3,5,7,9]:
    points = slidingWindow(i,minDate,maxDate,input_df,'tech')
    # Each point is [category (trend) fraction, fake fraction]; name the
    # columns accordingly. They were previously labelled the other way round,
    # which does not change the (symmetric) correlation.
    X = [point[0] for point in points]
    Y = [point[1] for point in points]
    correlation_df = pd.DataFrame({'trendFraction': X, 'fakeFraction': Y})
    print(correlation_df['trendFraction'].corr(correlation_df['fakeFraction']))

0.8865562057724649
0.9010388712159485
0.825853803983408
0.8612431733963276


In [6]:
# Scratch cell: start from an empty frame; its columns are filled in by hand in the next cell.
correlation_df = pd.DataFrame()


In [8]:
# Hard-coded values for a single check of the correlation.
# NOTE(review): presumably pasted from one slidingWindow run. slidingWindow
# emits [category (trend) fraction, fake fraction] pairs, so if the lists were
# copied in that order the first list is the trend fraction despite its
# 'fakeFraction' label, and vice versa. TODO confirm which list is which.
correlation_df['fakeFraction'] = [0.2727272727272727, 0.189873417721519, 0.1935483870967742, 0.16188524590163936, 0.13682092555331993, 0.15155807365439095, 0.1392904073587385, 0.10881652104845115, 0.10244648318042814, 0.11143552311435523, 0.12171561051004637, 0.11321206318812829, 0.1419120079391333, 0.06818181818181818, 0.07079646017699115, 0.10256410256410256, 0.060240963855421686, 0.041666666666666664, 0.08196721311475409, 0.203125, 0.17543859649122806, 0.09090909090909091]
correlation_df['trendFraction'] = [0.0, 0.0, 0.803030303030303, 0.8734177215189873, 0.7794117647058824, 0.8130841121495327, 0.7641509433962265, 0.8029197080291971, 0.9626865671641791, 0.9650655021834061, 0.9746031746031746, 0.9915433403805497, 0.9906759906759907, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [12]:
# Correlation between the two hand-entered columns (Series.corr; the argument order does not affect the value).
correlation_df['fakeFraction'].corr(correlation_df['trendFraction'])

0.0818714316665238

NameError: name 'correlation' is not defined