In [1]:
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus.reader import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import punkt
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory (relative to the notebook's working directory) that holds the pickled models.
models_path = "Models/"

# Path of the pickled classifier; presumably a fitted scikit-learn SVC -- confirm against the training notebook.
svm_path = models_path + 'best_svc.pickle'

# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open(svm_path, 'rb') as data:
    svc_model = pickle.load(data)

In [3]:
# Path of the pickled object whose .transform() turns cleaned text into features;
# presumably a fitted TfidfVectorizer -- confirm against the training notebook.
tfidf_path = "Pickles/tfidf.pickle"
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open(tfidf_path, 'rb') as data:
    tfidf = pickle.load(data)


In [4]:

# Label -> integer code mapping. Codes are assigned by position in this list,
# so the order is significant (presumably it matches the encoding used when
# the classifier was trained -- confirm before reordering).
category_codes = {
    label: code
    for code, label in enumerate(
        ['business', 'entertainment', 'politics', 'sport', 'tech']
    )
}

In [9]:
# Punctuation characters stripped (as literal characters) during cleaning.
punctuation_signs = list("?:!.,;")
# English stop words from the NLTK corpus; each is removed as a whole word.
stop_words = list(stopwords.words('english'))


def _clean_text(text):
    """Return ``text`` normalised, lemmatised and with stop words removed.

    The steps run in a fixed order because later steps depend on earlier
    ones (e.g. stop words are matched against lower-cased, lemmatised text).
    """
    # 1. Whitespace and quote normalisation.
    parsed = (
        text.replace("\r", " ")
        .replace("\n", " ")
        .replace("    ", " ")
        .replace('"', '')
    )

    # 2. Lower-casing.
    parsed = parsed.lower()

    # 3. Punctuation removal. Plain str.replace keeps '?' and '.' literal;
    #    they must not be interpreted as regex metacharacters.
    for punct_sign in punctuation_signs:
        parsed = parsed.replace(punct_sign, '')

    # 4. Possessive removal.
    parsed = parsed.replace("'s", "")

    # 5. Lemmatise every space-separated token as a verb.
    wordnet_lemmatizer = WordNetLemmatizer()
    parsed = " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v") for word in parsed.split(" ")
    )

    # 6. Stop-word removal, one word at a time and in list order. The pattern
    #    is applied with re.sub so it is always treated as a regular
    #    expression (pandas' str.replace no longer does that by default).
    for stop_word in stop_words:
        regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
        parsed = re.sub(regex_stopword, '', parsed)

    return parsed


def create_features_from_text(text):
    """Convert one raw document into the feature matrix used for prediction.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    The dense array produced by ``tfidf.transform(...).toarray()`` for a
    one-document input (presumably shape ``(1, n_features)`` -- confirm
    against the fitted vectorizer).

    Notes
    -----
    Uses the module-level ``tfidf`` object. The cleaning steps presumably
    mirror the preprocessing applied when ``tfidf`` was fitted -- confirm
    against the training notebook before changing them.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([_clean_text(text)]).toarray()

    return features

In [10]:

def get_category_name(category_id):
    """Return the label in ``category_codes`` whose code equals ``category_id``.

    The first matching label (in dictionary order) is returned; ``None`` is
    returned when no label has that code.
    """
    matching_labels = (
        label for label, code in category_codes.items() if code == category_id
    )
    return next(matching_labels, None)

In [11]:
def predict_from_text(text):
    """Classify ``text`` with the loaded SVM model and print the result.

    Parameters
    ----------
    text : str
        Raw document text; it is converted to features with
        ``create_features_from_text``.

    Returns
    -------
    None
        The predicted category name and the highest class probability
        (as a percentage) are printed, not returned.
    """
    # Build the feature matrix once and reuse it for both model calls;
    # preprocessing and vectorising the text twice is wasted work.
    features = create_features_from_text(text)

    # Predict using the input model
    prediction_svc = svc_model.predict(features)[0]
    prediction_svc_proba = svc_model.predict_proba(features)[0]

    # Return result
    category_svc = get_category_name(prediction_svc)

    print("The predicted category using the SVM model is %s." %(category_svc) )
    # %s (str) rather than %a (ascii repr): the repr of a NumPy scalar is
    # "np.float64(...)" on NumPy >= 2, which would leak into the message.
    print("The conditional probability is: %s" %(prediction_svc_proba.max()*100))

In [12]:
# Sample input for the classifier demo: a news article about the Andalusian
# regional parliament. It is passed to predict_from_text in the next cell.
text = """

The center-right party Ciudadanos closed a deal on Wednesday with the support of the conservative Popular Party (PP) to take control of the speaker’s committee in the Andalusian parliament, paving the way for the regional PP leader, Juan Manuel Moreno, to stand as the candidate for premier of the southern Spanish region. The move would see the Socialist Party (PSOE) lose power in the Junta, as the regional government is known, for the first time in 36 years.

Talks in Andalusia have been ongoing since regional polls were held on December 2. The PSOE, led by incumbent premier Susana Díaz, had been expected to win the early elections, but in a shock result the party took the most seats in parliament, 33, but fell well short of a majority of 55. It was their worst result in the region since Spain returned to democracy. The PP came in second, with 26 seats, while Ciudadanos were third with 21. The major surprise was the strong performance of far-right group Vox, which won more than 391,000 votes (10.9%), giving it 12 deputies. The anti-immigration group is the first of its kind to win seats in a Spanish parliament since the end of the Francisco Franco dictatorship. It now holds the key to power in Andalusia, given that its votes, added to those of the PP and Ciudadanos, constitute an absolute majority.

The move would see the Socialist Party lose power in the region for the first time in 36 years

On Thursday, Marta Bosquet of Ciudadanos was voted in as the new speaker of the Andalusian parliament thanks to 59 votes from her party, the PP and Vox. The other candidate, Inmaculada Nieto of Adelante Andalucía, secured 50 votes – from her own party and 33 from the PSOE.

The speaker’s role in the parliament is key for the calling of an investiture vote and for the selection of the candidate for premier.

Officially, the talks as to the make up of a future government have yet to start, but in reality they are well advanced, according to sources from both the PP and Ciudadanos. The leader of the Andalusian PP is banking on being voted into power around January 16 and wants the majority of his Cabinet to be decided “five days before the investiture vote.”

The speaker’s role in the parliament is key for the calling of an investiture vote and for the selection of the candidate for premier

The PP, which was ousted from power by the PSOE in the national government in June, is keen to take the reins of power in Andalusia as soon as possible. The difficulties that Ciudadanos has faced to justify the necessary inclusion of Vox in the talks, has slowed down progress. Rather than align itself with the far right party, the group – which began life in Catalonia in response to the independence drive, but soon launched onto the national stage – had sought a deal with Adelante Andalucía.

Wednesday was a day of intense talks among the parties in a bid to find a solution that would keep everyone happy. But at 9pm last night, Adelante Andalucía announced that it would not be part of “any deal” and that would instead vote for its own candidates to the speaker’s committee in order to “face up to the right wing and the extreme right.”

The PSOE, meanwhile, argues that having won the elections with a seven-seat lead over the PP gives it the legitimacy to aspire to the control of the regional government and the parliament, and to maintain its positions on the speaker’s committee.



"""


In [13]:
# Classify the article currently bound to `text`; prints the predicted category and its probability.
predict_from_text(text)

The predicted category using the SVM model is politics.
The conditional probability is: 92.39503972744953


In [14]:
# Sample input for the classifier demo: an obituary of restaurateur and model
# B. Smith. It is passed to predict_from_text in the next cell.
text="""
The restaurateur, model and author B. Smith died on Saturday after a battle with Alzheimer's disease, her husband Dan Gasby said in a statement. She was 70 years old.

"Thank you to all the friends and fans who supported B. and our family during her journey," Gasby said. "Thank you to everyone for respecting our privacy during this agonizing time."
"Heaven is shining even brighter now that it is graced with B.'s dazzling and unforgettable smile."
Smith, whose full name was Barbara Elaine Smith, was a multi-faceted and multi-talented personality.
In addition to building restaurants and a home decor collection, over the years, Smith was a fashion model, actress, TV host and bestselling cookbook author. In 1976 she became one of the first African American women to appear on the cover of "Mademoiselle" magazine.
Smith was diagnosed with early-onset Alzheimer's at the age of 64 in 2013.
"It feels like crying," she told CBS in a 2014 interview. "Things like that make me very sad."
But she was also determined to raise awareness of Alzheimer's, particularly for the African American community.
"We lost legendary fashion model, chef, restaurateur, lifestyle icon and magazine publisher, B Smith today," NBC's Al Roker said on Twitter.

"""

In [15]:
# Classify the article currently bound to `text`; prints the predicted category and its probability.
predict_from_text(text)

The predicted category using the SVM model is entertainment.
The conditional probability is: 93.92200108298984


In [16]:
# Sample input for the classifier demo: an article about Alabama's recruiting
# of football tight ends. It is passed to predict_from_text in the next cell.
text="""

Alabama spent much of the 2020 recruiting cycle in hot pursuit of 5-star tight ends Arik Gilbert and Darnell Washington.

Nick Saban and Co. failed.

The 6-foot-5, 250-pound Gilbert – from suburban Atlanta and the No. 1-ranked player on AL.com’s The Southern 120 – picked LSU over Alabama and Georgia. Washington, a 6-foot-7 athlete from Las Vegas, picked Georgia over Alabama, Miami and Tennessee.

It was clear throughout the recruiting cycle, though, that Alabama desperately wanted to bring in a plug-and-play tight end after last year’s tight end group combined for just 21 catches.

Alabama briefly flirted with 6-foot-7 McKinney North (Texas)’s Brandon Frazier – who ended up signing with Auburn – before abandoning January recruiting of high school tight ends.

The Crimson Tide simply turned to the transfer (free agent) portal and nabbed North Carolina grad transfer Carl Tucker, a 6-foot-2, 248-pounder who started 20 career games and caught a career-best 16 passes in the Tar Heels’ ‘Air Raid’ offense in 2018.

Why take an unproven 18-year-old over Tucker, who was initially part of the 2015 recruiting cycle and has had nearly five years to get bigger, stronger and mentally strong enough to play in the rough-and-tumble SEC.

The funny thing is Alabama actually signed another tight end – Archbishop Hoban (Ohio)’s Caden Clark, a 6-foot-3½, 245-pound 3-star prospect – but the Tide’s pursuit of others suggests he’s more a development player than ready to provide plug-and-play immediate help.

Alabama produced first-round NFL draft pick O.J. Howard in 2017 and second-round pick Irv Smith in 2019, but there’s no one of comparable talent currently on the roster.

With a new starting quarterback leading the offense 2020, it’s easy to see how and why a reliable and explosive pass-catching tight end would be a welcome security blanket for Mac Jones, Bryce Young or whoever ultimately wins the job.
"""

In [17]:
# Classify the article currently bound to `text`; prints the predicted category and its probability.
predict_from_text(text)

The predicted category using the SVM model is sport.
The conditional probability is: 89.84072579967938


In [18]:
# Sample input for the classifier demo: an article about the Nevada caucuses
# result. It is passed to predict_from_text in the next cell.
text ="""
Senator Bernie Sanders claimed a major victory in the Nevada caucuses on Saturday that demonstrated his broad appeal in the first racially diverse state in the presidential primary race and established him as the clear front-runner for the Democratic nomination.

In a significant show of force, Mr. Sanders, a liberal from Vermont, had a lead that was more than double his nearest rivals with 50 percent of the precincts reporting, and The Associated Press named him the winner on Saturday evening.

His triumph in Nevada, after strong performances in Iowa and New Hampshire, will propel him into next Saturday’s primary in South Carolina, and the Super Tuesday contests immediately thereafter, with a burst of momentum that may make it difficult for the still-fractured moderate wing of the party to slow his march.

Mr. Sanders, speaking to jubilant supporters in San Antonio, trumpeted what early results suggested would be a landslide victory.

"""

In [19]:
# Classify the article currently bound to `text`; prints the predicted category and its probability.
predict_from_text(text)

The predicted category using the SVM model is politics.
The conditional probability is: 73.18641232211323
