In [47]:
using Random
Random.seed!(13)

using TextAnalysis
using CSV
using Plots
using DataFrames
using Statistics
using Missings
using Distributions
using StatsPlots
using LowRankModels, LinearAlgebra

In [48]:
include("proxgrad.jl")
df = CSV.read("airline.csv")

Unnamed: 0_level_0,airline_name,link,title,author
Unnamed: 0_level_1,String,String,String,String
1,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,D Ito
2,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Ron Kuhlmann
3,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,E Albin
4,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Tercon Bojan
5,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,L James
6,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Dzano Edin
7,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,M Tushishvili
8,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,M Znidar
9,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,Tercon Bojan
10,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,M Znidar


In [49]:
# Organize and display features along with variable type
feature_names = names(df)
for i in 1:20
    println(string(i), "\t", string(feature_names[i]), "\t\t\t", string(eltype(df[!, i])))
end

1	airline_name			String
2	link			String
3	title			String
4	author			String
5	author_country			Union{Missing, String}
6	date			Dates.Date
7	content			String
8	aircraft			Union{Missing, String}
9	type_traveller			Union{Missing, String}
10	cabin_flown			Union{Missing, String}
11	route			Union{Missing, String}
12	overall_rating			Union{Missing, Float64}
13	seat_comfort_rating			Union{Missing, Float64}
14	cabin_staff_rating			Union{Missing, Float64}
15	food_beverages_rating			Union{Missing, Float64}
16	inflight_entertainment_rating			Union{Missing, Float64}
17	ground_service_rating			Union{Missing, Float64}
18	wifi_connectivity_rating			Union{Missing, Float64}
19	value_money_rating			Union{Missing, Float64}
20	recommended			Int64


In [50]:
# df = df[1:1000,:]

df_OR = df[.!(ismissing.(df[!, :content])), :];
df_content = df_OR[:,:content];

In [51]:
df_ratings_data = df[.!(ismissing.(df[!, :overall_rating])), :];
df_ratings = df_ratings_data[:,:overall_rating]
println("Rating Mean: ",mean(df_ratings))
println("Rating Median: ",median(df_ratings))
df_ratings_mean = abs.(df_ratings .- mean(df_ratings));
df_ratings_median = abs.(df_ratings .- median(df_ratings));

Rating Mean: 6.039526871218904
Rating Median: 7.0


In [52]:
# Create wordlist with all words in text reviews
wordlist = String[]
for line in df_content
    words = split(line, r"\W")
    map(w -> push!(wordlist, lowercase(w)), words)
end
filter!(!isempty, wordlist)

4923786-element Array{String,1}:
 "outbound"   
 "flight"     
 "fra"        
 "prn"        
 "a319"       
 "2"          
 "hours"      
 "10"         
 "min"        
 "flight"     
 "i"          
 "thought"    
 "drinks"     
 ⋮            
 "at"         
 "the"        
 "gate"       
 "no"         
 "explanation"
 "offered"    
 "no"         
 "drinks"     
 "available"  
 "on"         
 "the"        
 "flight"     

In [53]:
# Create dictionary that counts number of occurances for each word
wordcounts = Dict{String,Int64}()
for word in wordlist
    wordcounts[word]=get(wordcounts, word, 0) + 1
end

# Sort to find most common words
common_words = sort(collect(wordcounts), by = tuple -> last(tuple), rev=true)

36721-element Array{Pair{String,Int64},1}:
           "the" => 231613
           "and" => 154486
            "to" => 151789
           "was" => 107931
             "a" => 98839 
             "i" => 81266 
            "on" => 78689 
            "in" => 77581 
        "flight" => 69612 
            "of" => 62870 
           "for" => 54457 
          "with" => 49020 
          "were" => 48286 
                 ⋮        
          "0610" => 1     
         "ey431" => 1     
        "nagged" => 1     
       "roaches" => 1     
 "canelécookies" => 1     
      "smuggled" => 1     
  "germandwings" => 1     
         "ul318" => 1     
          "weez" => 1     
       "leavers" => 1     
          "3276" => 1     
       "chárter" => 1     

In [54]:
num_lines = 500

for i in 1:1:num_lines
    println(common_words[i])
end

"the" => 231613
"and" => 154486
"to" => 151789
"was" => 107931
"a" => 98839
"i" => 81266
"on" => 78689
"in" => 77581
"flight" => 69612
"of" => 62870
"for" => 54457
"with" => 49020
"were" => 48286
"we" => 43485
"not" => 36874
"is" => 34762
"it" => 33726
"but" => 33457
"at" => 33424
"that" => 32117
"from" => 31275
"they" => 28756
"had" => 28123
"very" => 27921
"my" => 27521
"no" => 25476
"as" => 25204
"this" => 24874
"have" => 24100
"service" => 23420
"time" => 22263
"good" => 21046
"food" => 20408
"seats" => 19629
"you" => 19433
"all" => 18573
"be" => 18218
"t" => 17516
"seat" => 17094
"an" => 16866
"are" => 16808
"flights" => 16656
"crew" => 16239
"staff" => 15878
"our" => 15854
"so" => 15781
"would" => 14457
"there" => 14382
"only" => 14134
"plane" => 13697
"class" => 13083
"one" => 13023
"cabin" => 12923
"return" => 12760
"which" => 12663
"again" => 12597
"airline" => 12549
"check" => 12093
"us" => 11887
"or" => 11875
"when" => 11835
"by" => 11625
"me" => 11455
"their" => 11414
"hour

In [55]:

"good" "first" "entertainment" "friendly" "new" "comfortable" "great" "delayed" "excellent" "nice" "late" "better" "old" "offered"
"free" "delay" "helpful" "early" "pleasant" "best" "available" 

efficient, recommend, uncomfortable, cancelled, comfort, problems, delays, UNITED, cheap, awful

LoadError: syntax: extra token """ after end of expression

In [56]:
# Bag of words function to add bag of word features to data 

bag_of_words = ["good",
                "first",
                "entertainment",
                "friendly",
                "new",
                "comfortable",
                "great",
                "delayed",
                "excellent",
                "nice",
                "late",
                "better",
                "offered",
                "free",
                "delay",
                "helpful",
                "early",
                "pleasant",
                "best",
                "available",
                "efficient",
                "recommend",
                "uncomfortable",
                "cancelled",
                "comfort",
                "problems",
                "delays",
                "united",
                "cheap",
                "awful"
                ]

text_features = ["service",
                 "time",
                 "food",
                 "seats",
                 "seat",
                 "crew",
                 "staff",
                 "cabin",
                 "entertainment",
                 "boarding",
                 "leg"
                 ]

"Computes a manyhot vector for every entry in column given a bag of words"
function manyhot(column, words=bag_of_words)
    result = zeros(size(column,1), size(words,1))
    for i = 1:size(column,1)
        for j in 1:size(words,1)
            if occursin(words[j],column[i])
                result[i,j] = Int(1)
            end
        end
    end
    convert(Array{Int64,2},result)
end

manyhot

In [57]:
# Reduce number of datapoints for computation time

# Filter data to only consider entries with overall rating
df_OR = df[.!(ismissing.(df[!, :overall_rating])), :];

# df_OR_filtered = df_OR[.!(ismissing.(df_OR[!, :seat_comfort_rating])), :]; 
# data_OR_filtered = df_OR_filtered[.!(ismissing.(df_OR_filtered[!, :cabin_staff_rating])), :];
# data_OR_filtered = data_OR_filtered[.!(ismissing.(df_OR_filtered[!, :content])), :];

# # Now adding food_beverages_rating, inflight_entertainment_rating, and value money rating
# data_OR_filtered = data_OR_filtered[.!(ismissing.(data_OR_filtered[!, :food_beverages_rating])), :];
# data_OR_filtered = data_OR_filtered[.!(ismissing.(data_OR_filtered[!, :inflight_entertainment_rating])), :];
# data_OR_filtered = data_OR_filtered[.!(ismissing.(data_OR_filtered[!, :value_money_rating])), :];

train_proportion = 0.8
n = size(data_OR_filtered, 1)
println("Size of dataset: ", string(n))

# Put the first ntrain observations in the DataFrame df into the training set, and the rest into the test set
ntrain = convert(Int, round(train_proportion*n))
println("Size of train: ", string(round(train_proportion*n)))
println("Size of test: ", string(round(n-train_proportion*n)))

# Create manyhot bag of words binary array and add feature columns to original airline dataframe
df_content = data_OR_filtered[:, :content]
manyhot_data = manyhot(df_content)
for i = 1:size(text_features,1)
    data_OR_filtered[!,Symbol(text_features[i])] = manyhot_data[:,i]
end



# data_OR_filtered = data_OR_filtered[.!(ismissing.(data_OR_filtered[!, :value_money_rating])), :];

# Get mean and median ratings

target = abs.(data_OR_filtered[:, :overall_rating] .- mean(df_ratings));
# target = abs.(data_OR_filtered[:, :overall_rating] .- median(df_ratings));


# Filter data for entries with only seat_comfort, cabin_staff
# df_OR_filtered = df_OR[.!(ismissing.(df[!, :seat_comfort_rating])), :]; 
# df_OR_filtered = df_OR[.!(ismissing.(df[!, :cabin_staff_rating])), :]; 
# data_OR_filtered = df_OR_filtered[:, filter(col -> (col != :overall_rating), feature_names)]

# the following variable records the features of examples in the training set
train_x = data_OR_filtered[1:ntrain,:]

# the following variable records the features of examples in the test set
test_x = data_OR_filtered[ntrain+1:end,:]

# the following variable records the labels of examples in the training set
train_y = target[1:ntrain]
train_y = collect(skipmissing(train_y))
# the following variable records the labels of examples in the test set
test_y = target[ntrain+1:end]
test_y = collect(skipmissing(test_y));

Size of dataset: 28341
Size of train: 22673.0
Size of test: 5668.0


In [58]:
manyhot_data

28341×30 Array{Int64,2}:
 0  0  0  1  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 1  0  0  0  0  1  0  0  0  1  0  0  0     0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  1  1  0  0  0  0  1  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 1  0  0  0  1  1  0  0  0  0  0  0  0     0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  1  0  0  0  0  0  0  0  0  1     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  1  1  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  1  0  0  0  0  0  0  0  0  0     0  0  1  0  0  0  0  0  0  0  0  0
 0  0  0  1  0  1  0  0  1  1  0  0  0     0  0  0  0  0  0  1  0  0  0  0  0
 1  0  0  0  0  0  1  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  1  0  1  0  0  0  0  0  0  0     0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  0  1  1  0  0  0  0  0  0  0  …  0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  1     0  0  0  0  0  0  0  0  0  0  0  0
 1  0  0  0  0  1  0  0  0  1  0  0  1 

In [59]:
"""This function just computes the mean squared error."""
function MSE(y, pred)
    "Fill this in."
    num = size(y,1)
    error = sum(abs.((y.-pred).^2))/num
    return error
end

"""This function plots the main diagonal; 
for a "predicted vs true" plot with perfect predictions,
all data lies on this line"""
function plotDiagonal(xmin, xmax)
    xsamples = [xmin, xmax]
    plot!(xsamples, xsamples, color=:black)
end

"""This helper funciton plots x vs, y and labels the axes."""
function plotdata(x,y,xname, yname; margin=.05, plotDiag=true, zeromin=false)
    scatter(x,y, label="data")
    xlabel!(xname)
    ylabel!(yname)
    range_y = maximum(y) - minimum(y)
    range_x = maximum(x) - minimum(x)
    if plotDiag
        plotDiagonal(minimum(x)-margin*range_x, maximum(x)+margin*range_x)
    end
    if zeromin
        ylims!((0.0,maximum(y)+margin*range_y))
        xlims!((0.0,maximum(x)+margin*range_x))
    else
        ylims!((minimum(y)-margin*range_y,maximum(y)+margin*range_y))
        xlims!((minimum(x)-margin*range_x,maximum(x)+margin*range_x))
    end
end

"""This function plots the predicted labels vs the actual labels
(We only plots the first 1000 points to avoid slow plots.)"""
function plot_pred_true(test_pred, test_y, max_points = 1000)
    plotdata(test_pred[1:max_points], test_y[1:max_points], "Predicted Rating", "True Rating", zeromin=true)
end

plot_pred_true

In [60]:
"This function converts strings to floating point values.
Strings that cannot be represented as a number (like NA) are converted to zeros"
function string_to_float(str)
    try
        parse(Float64, str)
    catch
       0.0
    end
end

labels_all = [
  :overall_rating,
  :seat_comfort_rating,
  :cabin_staff_rating,
  :food_beverages_rating,
  :inflight_entertainment_rating,
  :ground_service_rating,
  :wifi_connectivity_rating,
  :value_money_rating,
  :recommended
]

labels_real = [
    :seat_comfort_rating,
    :cabin_staff_rating,
    :food_beverages_rating,
    :inflight_entertainment_rating,
    :value_money_rating
]


symbol_words = Symbol[]

for word in text_features
    push!(symbol_words,Symbol(word))
end

labels_all_words = vcat(labels_all,symbol_words);
labels_real_words = vcat(labels_real,symbol_words);

In [61]:
# # Only converting for seat comfort and cabin staff
# train_vals_real = convert(Matrix,train_x[:,filter(col -> (col in labels_real), [:seat_comfort_rating,:cabin_staff_rating])]);
# test_vals_real = convert(Matrix,test_x[:,filter(col -> (col in labels_real), [:seat_comfort_rating,:cabin_staff_rating])]);
# size(train_vals_real,1)
# any(ismissing,train_vals_real)

# Only converting for seat comfort, cabin staff, food/beverage, and inflight entertainment
# train_vals_real = convert(Matrix,train_x[:,filter(col -> (col in labels_real), [:seat_comfort_rating,:cabin_staff_rating,:food_beverages_rating,:inflight_entertainment_rating,:value_money_rating])]);
# test_vals_real = convert(Matrix,test_x[:,filter(col -> (col in labels_real), [:seat_comfort_rating,:cabin_staff_rating,:food_beverages_rating,:inflight_entertainment_rating,:value_money_rating])]);
# size(train_vals_real,1)
# any(ismissing,train_vals_real)

# Only converting for seat comfort, cabin staff, food/beverage, and inflight entertainment AND BAG OF WORDS
train_vals_real = convert(Matrix,train_x[:,filter(col -> (col in symbol_words), labels_all_words)])
test_vals_real = convert(Matrix,test_x[:,filter(col -> (col in symbol_words), labels_all_words)])
size(train_vals_real,1)
any(ismissing,train_vals_real)

train_vals_real = convert(Array{Float64,2},train_vals_real)
test_vals_real = convert(Array{Float64,2},test_vals_real)

# # Add offset
train_vals_real = [train_vals_real ones(size(train_vals_real,1))];
test_vals_real = [test_vals_real ones(size(test_vals_real,1))];

[train_vals_real train_y]

22673×13 Array{Float64,2}:
 0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.960473 
 1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  3.96047  
 0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  2.96047  
 1.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  1.96047  
 0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  2.03953  
 0.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  2.96047  
 0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.03953  
 0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0  1.0  2.96047  
 1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  1.96047  
 0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  3.96047  
 0.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  2.96047  
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.960473 
 1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  1.96047  
 ⋮                        ⋮                       

In [62]:
loss_L2 = 1/n*L1Loss()
λ = 0
# Quad Regularizer
reg = QuadReg(λ)

w_L2 = proxgrad(loss_L2, reg, train_vals_real, df_ratings_mean, maxiters=100)
train_pred = train_vals_real*w_L2
test_pred = test_vals_real*w_L2
train_MSE = MSE(train_pred,train_y)
test_MSE = MSE(test_pred,test_y)

println("Train MSE:\t", train_MSE)
println("Test MSE: \t", test_MSE)

Train MSE:	2.087581973260096
Test MSE: 	2.273849728445067


In [17]:
text = join(df_content)
sd1 = StringDocument(text)
ngams_dict = ngrams(sd1,2)

Dict{AbstractString,Int64} with 853060 entries:
  "plain look"              => 1
  "time. 737"               => 2
  "to JNU"                  => 1
  "down against"            => 1
  "arranged right."         => 1
  "this there"              => 11
  "Great service.Monarch"   => 1
  "on find"                 => 1
  "understand Economy"      => 1
  "them.Appalling customer" => 1
  "seats. Couldn"           => 1
  "quiet seating"           => 1
  "wonderful Taiwanese"     => 1
  "humoured crew."          => 1
  "much does"               => 1
  "experience outstanding." => 2
  "understood how"          => 2
  "Köln these"              => 1
  "fare. Let"               => 1
  "hour Police"             => 1
  "nice tilapia"            => 1
  "my item"                 => 1
  "London-Tashkent in"      => 1
  "impossible on"           => 1
  "choose.Had"              => 1
  ⋮                         => ⋮

In [18]:
sorted_dict = sort(collect(ngams_dict), by=x->x[2],rev=true)

853060-element Array{Pair{AbstractString,Int64},1}:
                 "the" => 136864
                 "and" => 110196
                  "to" => 101919
                 "was" => 77998 
                   "a" => 67507 
                   "I" => 55243 
                  "on" => 50808 
                  "in" => 47413 
                  "of" => 44517 
                 "for" => 35914 
              "flight" => 34949 
                "with" => 34511 
                "were" => 34310 
                       ⋮        
    "nice.PHL-SDF and" => 1     
   "lavatories broken" => 1     
     "expedited. Gate" => 1     
  "delightful. Geneva" => 1     
            "' token." => 1     
  "Tegucigalpa better" => 1     
           "used.Flew" => 1     
              "to GDN" => 1     
 "business broadsheet" => 1     
  "money.FCO-LHR-YVR." => 1     
      "Indian Chinese" => 1     
          "vision for" => 1     

In [19]:
two_sorted_dict = filter(p -> occursin(" ",first(p)),sorted_dict)

778070-element Array{Pair{AbstractString,Int64},1}:
                 "' t" => 11947
              "on the" => 11255
              "of the" => 10270
              "in the" => 8242 
             "and the" => 8038 
              "to the" => 6876 
          "the flight" => 6872 
               "I was" => 6712 
                 "' s" => 6481 
               "was a" => 5776 
             "for the" => 5666 
              "it was" => 5347 
               "to be" => 5338 
                       ⋮       
               "SOF )" => 1    
        "As customers" => 1    
    "nice.PHL-SDF and" => 1    
   "lavatories broken" => 1    
     "expedited. Gate" => 1    
  "delightful. Geneva" => 1    
            "' token." => 1    
  "Tegucigalpa better" => 1    
              "to GDN" => 1    
 "business broadsheet" => 1    
      "Indian Chinese" => 1    
          "vision for" => 1    

In [20]:
num_lines = 500

for i in 1:num_lines
    println(two_sorted_dict[i])
end

Pair{AbstractString,Int64}("' t", 11947)
Pair{AbstractString,Int64}("on the", 11255)
Pair{AbstractString,Int64}("of the", 10270)
Pair{AbstractString,Int64}("in the", 8242)
Pair{AbstractString,Int64}("and the", 8038)
Pair{AbstractString,Int64}("to the", 6876)
Pair{AbstractString,Int64}("the flight", 6872)
Pair{AbstractString,Int64}("I was", 6712)
Pair{AbstractString,Int64}("' s", 6481)
Pair{AbstractString,Int64}("was a", 5776)
Pair{AbstractString,Int64}("for the", 5666)
Pair{AbstractString,Int64}("it was", 5347)
Pair{AbstractString,Int64}("to be", 5338)
Pair{AbstractString,Int64}("flight was", 4996)
Pair{AbstractString,Int64}("at the", 4978)
Pair{AbstractString,Int64}("and I", 4693)
Pair{AbstractString,Int64}(") .", 4666)
Pair{AbstractString,Int64}("we were", 4471)
Pair{AbstractString,Int64}("for a", 4402)
Pair{AbstractString,Int64}("I have", 4246)
Pair{AbstractString,Int64}("had to", 3970)
Pair{AbstractString,Int64}("I had", 3960)
Pair{AbstractString,Int64}("on time", 3956)
Pair{Abstra

Pair{AbstractString,Int64}("Business Class", 1054)
Pair{AbstractString,Int64}("2 hours", 1053)
Pair{AbstractString,Int64}("and very", 1050)
Pair{AbstractString,Int64}("front of", 1043)
Pair{AbstractString,Int64}("and not", 1042)
Pair{AbstractString,Int64}("comfortable and", 1039)
Pair{AbstractString,Int64}("There was", 1038)
Pair{AbstractString,Int64}("on an", 1038)
Pair{AbstractString,Int64}("this is", 1037)
Pair{AbstractString,Int64}("there were", 1037)
Pair{AbstractString,Int64}("A 320", 1036)
Pair{AbstractString,Int64}("I flew", 1032)
Pair{AbstractString,Int64}("I can", 1027)
Pair{AbstractString,Int64}("Cabin crew", 1018)
Pair{AbstractString,Int64}("the new", 1014)
Pair{AbstractString,Int64}("you are", 1013)
Pair{AbstractString,Int64}("would not", 1012)
Pair{AbstractString,Int64}("The only", 1012)
Pair{AbstractString,Int64}("clean and", 1004)
Pair{AbstractString,Int64}("2014 .", 998)
Pair{AbstractString,Int64}("as a", 995)
Pair{AbstractString,Int64}("during the", 988)
Pair{Abstract

Pair{AbstractString,Int64}("3 hours", 667)
Pair{AbstractString,Int64}("the plane.", 665)
Pair{AbstractString,Int64}("we would", 664)
Pair{AbstractString,Int64}("Flight attendants", 660)
Pair{AbstractString,Int64}("a nice", 659)
Pair{AbstractString,Int64}("for this", 657)
Pair{AbstractString,Int64}("is no", 655)
Pair{AbstractString,Int64}("B 777", 655)
Pair{AbstractString,Int64}("my seat", 652)
Pair{AbstractString,Int64}("Staff were", 650)
Pair{AbstractString,Int64}("the extra", 648)
Pair{AbstractString,Int64}("When I", 643)
Pair{AbstractString,Int64}("a short", 642)
Pair{AbstractString,Int64}("the flights", 641)
Pair{AbstractString,Int64}("nice and", 641)
Pair{AbstractString,Int64}("not be", 641)
Pair{AbstractString,Int64}("I didn", 640)
Pair{AbstractString,Int64}("to see", 639)
Pair{AbstractString,Int64}("them again", 639)
Pair{AbstractString,Int64}("the entertainment", 638)
Pair{AbstractString,Int64}("and return", 638)
Pair{AbstractString,Int64}("told that", 636)
Pair{AbstractString,

In [21]:
list1 = ["one","one two"]
list2 = [6,7]
dictionary1 = Dict(zip(list1,list2))

Dict{String,Int64} with 2 entries:
  "one two" => 7
  "one"     => 6

In [22]:
two_sorted_dict = filter(p -> occursin(" ",first(p)),dictionary1)

Dict{String,Int64} with 1 entry:
  "one two" => 7

In [23]:
text = "look at"
occursin(" ",text)

true