In [3]:
# This scripts extracts RSS-feed from the online News-sites
# it is then cleans and structures them to be imported by another script

# Import packages
import feedparser


################################ RSS FEED Parser #####################################

RSS_URLS = ['http://www.dn.se/nyheter/m/rss/',
            'https://rss.aftonbladet.se/rss2/small/pages/sections/senastenytt/', 'https://feeds.expressen.se/nyheter/',
            'http://www.svd.se/?service=rss', 'http://api.sr.se/api/rss/program/83?format=145',
            'http://www.svt.se/nyheter/rss.xml'
              ]

posts = []

for url in RSS_URLS:
    posts.extend(feedparser.parse(url).entries)

######################################################################################



##################### Extracting the titles and summeries from the dataset ##################

def OnlyTitlesandSumaries():
    only_titles_and_summaries = []
    for x in posts:
        try:
            tempdict = {}
            tempdict["title"] = x["title"]
            tempdict["summary"] = x["summary"]
            only_titles_and_summaries.append(tempdict)
        except KeyError as ke:
            only_titles_and_summaries.append("") #replace the missing keys with empty space
    return only_titles_and_summaries

Only_the_titles_Summaries = OnlyTitlesandSumaries()


def TitleAndSummaryList():
    title_and_summary_list = []
    temp_and_summary_title_list = []
    for x in Only_the_titles_Summaries:
        for key in x:
            if 'title' == key:
                firstkey = x[key]
            if 'summary' == key:
                secondkey = x[key]
                temp_and_summary_title_list.append(firstkey + ' ' + secondkey)
        title_and_summary_list.append(temp_and_summary_title_list)
        temp_and_summary_title_list = []
    return title_and_summary_list

The_Title_Summary_List = TitleAndSummaryList()


#print(The_Title_Summary_List)
######################################################################################



##################### Concatenating the list of Titles into a single list  ##################

def PrintDeposit():
    newList= []
    for item in The_Title_Summary_List:
        for value in item:
            newList.append(value)
    return newList

printdepositlist = PrintDeposit()


printdepositlistNoEmpty = []

for item in printdepositlist:
    if item != ' ':
        printdepositlistNoEmpty.append(item)


print(len(printdepositlist))
print(len(printdepositlistNoEmpty))
######################################################################################

207
206


In [5]:
# This is a Machine Learning script that uses pre-labeled and pre-processed 
# data to train a ML algorithm to run a multi-label classification task

# Import all the neccecary packages for the program
import re
import sys
import warnings
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.multiclass import OneVsRestClassifier
#from RssFeedNewArticle import PrintDeposit #transfer your own list of pre-processed data from another Python Script



################################# Import your pre-label data #################################

data_path = "/Users/hannesyilmaz/Desktop/Portofolio/Text-Classification/Book1.csv"

data_raw = pd.read_csv(data_path)

data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=len(data_raw))]
#data.shape

###############################################################################################



if not sys.warnoptions:
    warnings.simplefilter("ignore")



################################# Check for the categories of Data #################################

categories = list(data_raw.columns.values)
categories = categories[2:]
print(categories)

###############################################################################################



################################# Clean the Data of non-numero-alphabetic symbols #################################

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z wåäöÅÄÖ]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

#####################################################################################################################



################################# Getting rid of stopwords and stemming the lexemes #################################

nltk.download('stopwords')

stop_words = set(stopwords.words('swedish'))
stop_words.update(['noll','ett','två','tre','fyra','fem','sex','sju','åtta','nio','tio','kunna','också','över','bland','förutom','hursom','än','inom'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['Heading'] = data['Heading'].apply(removeStopWords)
data.head()

stemmer = SnowballStemmer("swedish")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['Heading'] = data['Heading'].apply(stemming)
data.head()

#####################################################################################################################



################################# Splitting the data into training and testing chunks #################################

train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)

#print(train.shape)
#kprint(test.shape)

train_text = train['Heading']
test_text = test['Heading']

########################################################################################################################



################################# Defining my imported pre-labeled data set variable  #################################

my_text = PrintDeposit()

my_text_no_empty = []

print("my_text:", len(my_text))
print("my_text type:", type(my_text))
print("my_text_no_empty:", len(my_text_no_empty))

for item in my_text:
    if item != ' ':
        my_text_no_empty.append(item)

print("my_text_no_empty:", len(my_text_no_empty))

#######################################################################################################################



################################# Creating text vectors for the train and test dataset  #################################

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['Id','Heading'], axis=1)

x_test = vectorizer.transform(my_text) #For single case (your own sample text) checking
y_test = test.drop(labels = ['Id','Heading'], axis=1)

#######################################################################################################################



################################# Setting up ML algorithm  ###################################################

# DecisionTreeRegressor has heighest accuracy atm
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(DecisionTreeRegressor())),
            ])


dicts = []
print("my_text: ", len(my_text))
for category in categories:
    print('**Processing {} articles...**'.format(category))
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    counter = 0
    n_counter = []
    for text in x_test:
        prediction = LogReg_pipeline.predict(text)
            #print(type(prediction))
        for pred in np.nditer(prediction):
            #print('Predicted as {}'.format(pred)) #Your own sample data test
            #print("\n")
            actual_text = my_text[counter]
            counter +=1
            
            tempDict = {}
            if pred == 1:
                #for i in range((len(my_text) - 1)):
                tempDict[actual_text] = category # Move them into a temporary dictionary (dict)
                dicts.append(tempDict) # Then append them to the main list of dictionary
            else:
                tempDict[actual_text] = "empty"
                dicts.append(tempDict)

#print(dicts)


###############################################################################################################



################################# Reduce the duplication of keys and append labels(values) to each key ###################################################




new_dicts = defaultdict(list)
print("dicts before: ", len(dicts))

for d in dicts:
    for k, v in d.items():
        new_dicts[k].append(v)

'''
dictionary_list = []

for d in dicts:
    for key, value in d.items():
        if key not in d:
            dictionary_list[key] = []
        dictionary_list[key].append(value)

print(len(dictionary_list))
'''





#print("new_dicts text: ", list(new_dicts.keys())[0])
#print("my_text text: ", my_text[0])

print("dicts: ", (len(dicts)))
print("new_dicts: ", len(new_dicts))
print("new_dicts items: ", len(new_dicts.items()))

print("dicts type: ", type(dicts))
print("new_dicts type: ", type(new_dicts))

print(new_dicts)

##############################################################################################################################################################

['Politik', 'Utbildning', 'Religion', 'Miljo', 'Ekonomi', 'LivsstilFritt', 'SamhalleKonflikter', 'Halsa', 'Idrott', 'VetenskapTeknik']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannesyilmaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


my_text: 207
my_text type: <class 'list'>
my_text_no_empty: 0
my_text_no_empty: 206
my_text:  207
**Processing Politik articles...**
**Processing Utbildning articles...**
**Processing Religion articles...**
**Processing Miljo articles...**
**Processing Ekonomi articles...**
**Processing LivsstilFritt articles...**
**Processing SamhalleKonflikter articles...**
**Processing Halsa articles...**
**Processing Idrott articles...**
**Processing VetenskapTeknik articles...**
dicts before:  2070
dicts:  2070
new_dicts:  207
new_dicts items:  207
dicts type:  <class 'list'>
new_dicts type:  <class 'collections.defaultdict'>
defaultdict(<class 'list'>, {'Bopriserna fortsätter att rasa i Stockholm Fjärde månaden i rad med sjunkande priser.': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], '”Försvarsmakten är inte Peter Hultqvists egen leksaksarmé” ': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], 'Geoff Dyer z

In [6]:
newAlist = []
for i in my_text:
    for k, v in new_dicts.items():
        if i == k:
            newAlist.append(i)
            newAlist.append(v)

In [7]:
print(newAlist)

['Bopriserna fortsätter att rasa i Stockholm Fjärde månaden i rad med sjunkande priser.', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], '”Försvarsmakten är inte Peter Hultqvists egen leksaksarmé” ', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], 'Geoff Dyer zoomar in mästarnas sista strid ', ['empty', 'empty', 'empty', 'empty', 'empty', 'LivsstilFritt', 'empty', 'empty', 'empty', 'empty'], 'Emanuel Örtengren: Gubbarna fick fel – damfotbollen kan konkurrera på egna meriter ', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'SamhalleKonflikter', 'empty', 'empty', 'empty'], '”Vi har ändrat förutsättningarna för kvinnor i England” ', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'SamhalleKonflikter', 'empty', 'empty', 'empty'], 'Myanmars junta förlänger undantagstillståndet ', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], 'Fernando Alonso 

In [26]:
def onlyCategories(newAlist):
    second_values = []

    for index in range(1, len(newAlist), 2):
        second_values.append(newAlist[index])

    return second_values 

In [29]:
onlyCategoryList = onlyCategories(newAlist)

In [30]:
print(len(onlyCategoryList))

207


In [15]:
print(len(newAlist))

414


In [16]:
item = iter(newAlist)

In [17]:
ds = dict(zip(item, item))

In [18]:
print(ds)

{'Bopriserna fortsätter att rasa i Stockholm Fjärde månaden i rad med sjunkande priser.': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], '”Försvarsmakten är inte Peter Hultqvists egen leksaksarmé” ': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], 'Geoff Dyer zoomar in mästarnas sista strid ': ['empty', 'empty', 'empty', 'empty', 'empty', 'LivsstilFritt', 'empty', 'empty', 'empty', 'empty'], 'Emanuel Örtengren: Gubbarna fick fel – damfotbollen kan konkurrera på egna meriter ': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'SamhalleKonflikter', 'empty', 'empty', 'empty'], '”Vi har ändrat förutsättningarna för kvinnor i England” ': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'SamhalleKonflikter', 'empty', 'empty', 'empty'], 'Myanmars junta förlänger undantagstillståndet ': ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty'], 'Fernando Alonso 

In [19]:
print(len(ds))

207


In [20]:
allitems = []


for x in posts:
    tempdict = {}
    tempdict["title"] = x["title"]
    tempdict["summary"] = x["summary"]
    tempdict["link"] = x["link"]
    allitems.append(tempdict)

#print(allitems)


finalList = []
tempList = []
key1 = "title"
key2 = "summary"
key3 = "link"

for x in allitems:
    for key in x:
        if key1 == key:
            tempList.append(x[key])
        if key2 == key:
            tempList.append(x[key])
        if key3 == key:
            tempList.append(x[key])
    finalList.append(tempList)
    tempList = []

print(len(finalList))

207


In [21]:
print(finalList)

[['Bopriserna fortsätter att rasa i Stockholm', 'Fjärde månaden i rad med sjunkande priser.', 'https://www.dn.se/ekonomi/bopriserna-fortsatter-att-rasa-i-stockholm/'], ['”Försvarsmakten är inte Peter Hultqvists egen leksaksarmé”', '', 'https://www.dn.se/debatt/forsvarsmakten-ar-inte-peter-hultqvists-egen-leksaksarme/'], ['Geoff Dyer zoomar in mästarnas sista strid', '', 'https://www.dn.se/kultur/geoff-dyer-zoomar-in-mastarnas-sista-strid/'], ['Emanuel Örtengren: Gubbarna fick fel – damfotbollen kan konkurrera på egna meriter', '', 'https://www.dn.se/ledare/emanuel-ortengren-gubbarna-fick-fel-damfotbollen-kan-konkurrera-pa-egna-meriter/'], ['”Vi har ändrat förutsättningarna för kvinnor i England”', '', 'https://www.dn.se/sport/vi-har-andrat-forutsattningarna-for-kvinnor-i-england/'], ['Myanmars junta förlänger undantagstillståndet', '', 'https://www.dn.se/varlden/myanmars-junta-forlanger-undantagstillstandet/'], ['Fernando Alonso klar för Aston Martin', '', 'https://www.dn.se/sport/fern

In [22]:
print(len(finalList))

207


In [57]:
TotalLists = [a+[x] for a,x in zip(finalList, onlyCategoryList)]

In [66]:
print(TotalLists)

[['Bopriserna fortsätter att rasa i Stockholm', 'Fjärde månaden i rad med sjunkande priser.', 'https://www.dn.se/ekonomi/bopriserna-fortsatter-att-rasa-i-stockholm/', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty']], ['”Försvarsmakten är inte Peter Hultqvists egen leksaksarmé”', '', 'https://www.dn.se/debatt/forsvarsmakten-ar-inte-peter-hultqvists-egen-leksaksarme/', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'empty']], ['Geoff Dyer zoomar in mästarnas sista strid', '', 'https://www.dn.se/kultur/geoff-dyer-zoomar-in-mastarnas-sista-strid/', ['empty', 'empty', 'empty', 'empty', 'empty', 'LivsstilFritt', 'empty', 'empty', 'empty', 'empty']], ['Emanuel Örtengren: Gubbarna fick fel – damfotbollen kan konkurrera på egna meriter', '', 'https://www.dn.se/ledare/emanuel-ortengren-gubbarna-fick-fel-damfotbollen-kan-konkurrera-pa-egna-meriter/', ['empty', 'empty', 'empty', 'empty', 'empty', 'empty', 'SamhalleKonfli

In [60]:
print(len(TotalLists))

207
